In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import statistics
#import missingno as msno

In [8]:
# Source workbook. sheet_name=None makes read_excel return a dict that maps
# each sheet name to its DataFrame.
path_file = 'U1400.xlsx'
# Read from path_file rather than repeating the literal, so the two cannot drift apart.
sheets_excel = pd.read_excel(path_file, sheet_name=None)

In [9]:
# Write every worksheet to "<sheet name>.csv" (no index column) so that each
# section below can load its own table.
for name, sheet in sheets_excel.items():
    sheet.to_csv(f'{name}.csv', index=False)

### P4_S01

In [19]:
# Load the P4_S01 table (presumably one of the per-sheet CSVs exported from the workbook).
DF_P4_S1 = pd.read_csv('U1400P4S01.csv')

In [20]:
# Show column dtypes and non-null counts to spot missing data.
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12399 entries, 0 to 12398
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        12399 non-null  int64  
 1   member         12399 non-null  int64  
 2   employed_w     12399 non-null  int64  
 3   ISCO_w         12399 non-null  int64  
 4   ISIC_w         12399 non-null  int64  
 5   status_w       12399 non-null  int64  
 6   hours_w        11807 non-null  float64
 7   days_w         11809 non-null  float64
 8   income_w_m     12399 non-null  int64  
 9   income_w_y     12399 non-null  int64  
 10  wage_w_m       12396 non-null  float64
 11  wage_w_y       12398 non-null  float64
 12  perk_w_m       12398 non-null  float64
 13  perk_w_y       12398 non-null  float64
 14  netincome_w_m  12399 non-null  int64  
 15  netincome_w_y  12399 non-null  int64  
 16  Fasl           12399 non-null  int64  
 17  year           12399 non-null  int64  
 18  DYCOL0

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون DYCOL00 به‌طور کامل NaN است.

In [21]:
# Remove the DYCOL00 column (described in the note above as entirely NaN).
DF_P4_S1.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
همبستگی ستون‌های income_w، wage_w و netincome_w مورد بررسی قرار گرفت و چون همبستگی ستون income در سال‌های دیگر کم بود، فقط ستون wage_w حذف می‌شود.

In [22]:
# Pairwise correlation between the monthly income, wage and net-income columns.
DF_P4_S1[['income_w_m', 'wage_w_m', 'netincome_w_m']].corr()

Unnamed: 0,income_w_m,wage_w_m,netincome_w_m
income_w_m,1.0,0.421906,0.433007
wage_w_m,0.421906,1.0,0.982436
netincome_w_m,0.433007,0.982436,1.0


In [23]:
# Pairwise correlation between the yearly income, wage and net-income columns.
DF_P4_S1[['income_w_y', 'wage_w_y', 'netincome_w_y']].corr()

Unnamed: 0,income_w_y,wage_w_y,netincome_w_y
income_w_y,1.0,0.816534,0.827229
wage_w_y,0.816534,1.0,0.988077
netincome_w_y,0.827229,0.988077,1.0


In [24]:
# Remove both wage columns (monthly and yearly) in a single call.
DF_P4_S1.drop(columns=['wage_w_m', 'wage_w_y'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
با توجه به اطلاعات زیر، ستون‌های employed_w، status_w و perk_w_m نامتوازن (imbalanced) هستند؛ بنابراین آن‌ها را حذف می‌کنیم.

In [25]:
# For each column, print the share of ALL rows (NaN rows included in the
# denominator) taken by its single most frequent value.
n_rows = len(DF_P4_S1)
for col in DF_P4_S1.columns:
    top_share = DF_P4_S1[col].value_counts().max() / n_rows
    print(f'{col:>15} {top_share:f}')

        Address 0.000565
         member 0.658843
     employed_w 0.935721
         ISCO_w 0.181950
         ISIC_w 0.183402
       status_w 0.769981
        hours_w 0.521171
         days_w 0.517945
     income_w_m 0.071538
     income_w_y 0.038148
       perk_w_m 0.937818
       perk_w_y 0.583434
  netincome_w_m 0.071619
  netincome_w_y 0.037342
           Fasl 0.253972
           year 1.000000


In [26]:
# Remove the columns flagged as imbalanced in the note above.
DF_P4_S1.drop(columns=['employed_w', 'status_w', 'perk_w_m'], inplace=True)

In [28]:
# Missing-value count per remaining column.
DF_P4_S1.isna().sum()

Address            0
member             0
ISCO_w             0
ISIC_w             0
hours_w          592
days_w           590
income_w_m         0
income_w_y         0
perk_w_y           1
netincome_w_m      0
netincome_w_y      0
Fasl               0
year               0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
مقادیر خالی ستون‌های days_w و hours_w را با میانهٔ گروه‌بندی روی ISCO_w و ISIC_w پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [29]:
# Median of hours_w / days_w within each (ISCO_w, ISIC_w) group, broadcast
# back to one value per original row.
work_cols = ['hours_w', 'days_w']
temp = DF_P4_S1.groupby(['ISCO_w', 'ISIC_w'])[work_cols].transform('median')
# Fill only the missing cells, index-aligned with temp; cells whose group
# median is itself NaN stay NaN.
DF_P4_S1[work_cols] = DF_P4_S1[work_cols].fillna(temp)

In [30]:
# Missing-value count per column after the group-median fill.
DF_P4_S1.isna().sum()

Address           0
member            0
ISCO_w            0
ISIC_w            0
hours_w          63
days_w           63
income_w_m        0
income_w_y        0
perk_w_y          1
netincome_w_m     0
netincome_w_y     0
Fasl              0
year              0
dtype: int64

In [31]:
# Fill whatever the group medians could not cover with the column-wide median.
# The result is assigned back to the DataFrame. Calling fillna(inplace=True) on
# DF_P4_S1.hours_w is a chained in-place update on an intermediate Series:
# pandas 2.x warns about it, and under Copy-on-Write it no longer changes the
# DataFrame at all.
DF_P4_S1['hours_w'] = DF_P4_S1['hours_w'].fillna(DF_P4_S1['hours_w'].median())
DF_P4_S1['days_w'] = DF_P4_S1['days_w'].fillna(DF_P4_S1['days_w'].median())

In [32]:
# Missing-value count per column after the column-wide median fill.
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
perk_w_y         1
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [33]:
# Discard every row that still has a missing value in any column
# (the index is not reset afterwards).
DF_P4_S1.dropna(axis=0, how='any', inplace=True)

In [34]:
# Final check: missing-value count per column after dropping incomplete rows.
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
perk_w_y         0
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [36]:
# Persist the cleaned P4_S01 table; path_file holds the output name without extension.
path_file = 'U1400_P4_S01'
DF_P4_S1.to_csv(f'{path_file}.csv', index=False)

### P4_S02

In [83]:
# Load the P4_S02 table (presumably one of the per-sheet CSVs exported from the
# workbook) and preview its first rows.
DF_P4_S2 = pd.read_csv('U1400P4S02.csv')
DF_P4_S2.head()

Unnamed: 0,Address,member,employed_s,ISCO_s,ISIC_s,status_s,agriculture,hours_s,days_s,cost_employment,cost_raw,cost_machinery,cost_others,cost_tax,sale,income_s_y,Fasl,year,DYCOL00
0,10003003229,1,1,6111.0,1110.0,5,1,3.0,4.0,,35000000.0,,10000000.0,,75000000.0,30000000,1,1401,
1,10003003229,1,1,6121.0,1440.0,5,1,5.0,7.0,,40500000.0,,2000000.0,,86000000.0,43500000,1,1401,
2,10011009720,1,1,6121.0,1440.0,5,1,9.0,7.0,,72000000.0,,70000000.0,,350000000.0,208000000,1,1401,
3,10011009735,1,1,9129.0,96010.0,4,2,12.0,6.0,720000000.0,80000000.0,40000000.0,240000000.0,20000000.0,1580000000.0,480000000,1,1401,
4,10003003235,4,1,6111.0,1110.0,5,1,8.0,3.0,,85000000.0,3000000.0,,,250000000.0,162000000,1,1401,


In [84]:
# Show column dtypes and non-null counts to spot missing data.
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6376 entries, 0 to 6375
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          6376 non-null   int64  
 1   member           6376 non-null   int64  
 2   employed_s       6376 non-null   int64  
 3   ISCO_s           6375 non-null   float64
 4   ISIC_s           6375 non-null   float64
 5   status_s         6376 non-null   int64  
 6   agriculture      6376 non-null   int64  
 7   hours_s          6280 non-null   float64
 8   days_s           6280 non-null   float64
 9   cost_employment  5417 non-null   float64
 10  cost_raw         6086 non-null   float64
 11  cost_machinery   5790 non-null   float64
 12  cost_others      5954 non-null   float64
 13  cost_tax         5367 non-null   float64
 14  sale             6295 non-null   float64
 15  income_s_y       6376 non-null   int64  
 16  Fasl             6376 non-null   int64  
 17  year          

In [85]:
# Remove the DYCOL00 column (it shows no values in the preview above).
DF_P4_S2.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های cost جمع می‌شوند و در ستون جدیدی به نام Total_cost ذخیره می‌شوند.

In [86]:
# Collapse the five cost columns into a single Total_cost column.
del_columns = ['cost_employment', 'cost_raw', 'cost_machinery', 'cost_others', 'cost_tax']
# Sum exactly the listed columns rather than a label slice, which would silently
# include any column that happens to sit between the two end labels. sum() skips
# NaN, so a row with no recorded costs gets a total of 0.
# Total_cost is inserted where the first cost column currently sits, not at a
# hard-coded position.
DF_P4_S2.insert(
    DF_P4_S2.columns.get_loc(del_columns[0]),
    'Total_cost',
    DF_P4_S2[del_columns].sum(axis=1),
)
DF_P4_S2.drop(columns=del_columns, inplace=True)

In [87]:
# Missing-value count per column after building Total_cost.
DF_P4_S2.isna().sum()

Address         0
member          0
employed_s      0
ISCO_s          1
ISIC_s          1
status_s        0
agriculture     0
hours_s        96
days_s         96
Total_cost      0
sale           81
income_s_y      0
Fasl            0
year            0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
مقادیر خالی ستون‌های days_s و hours_s را با میانهٔ گروه‌بندی روی ISCO_s و ISIC_s پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [88]:
# Step 1: median of hours_s / days_s within each (ISCO_s, ISIC_s) group,
# broadcast back to one value per original row.
temp = DF_P4_S2.groupby(['ISCO_s', 'ISIC_s'])[['hours_s', 'days_s']].transform('median')

# Fill only the missing cells with their group median.
DF_P4_S2.loc[DF_P4_S2.hours_s.isna(), 'hours_s'] = temp.loc[DF_P4_S2.hours_s.isna(), 'hours_s']
DF_P4_S2.loc[DF_P4_S2.days_s.isna(), 'days_s'] = temp.loc[DF_P4_S2.days_s.isna(), 'days_s']

# Step 2: anything still missing falls back to the column-wide median.
# The result is assigned back to the DataFrame. Calling fillna(inplace=True) on
# DF_P4_S2.hours_s is a chained in-place update on an intermediate Series:
# pandas 2.x warns about it, and under Copy-on-Write it no longer changes the
# DataFrame at all.
DF_P4_S2['hours_s'] = DF_P4_S2['hours_s'].fillna(DF_P4_S2['hours_s'].median())
DF_P4_S2['days_s'] = DF_P4_S2['days_s'].fillna(DF_P4_S2['days_s'].median())

In [89]:
# Missing-value count per column after the median fills.
DF_P4_S2.isna().sum()

Address         0
member          0
employed_s      0
ISCO_s          1
ISIC_s          1
status_s        0
agriculture     0
hours_s         0
days_s          0
Total_cost      0
sale           81
income_s_y      0
Fasl            0
year            0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون sale:
همان‌طور که در کد زیر نشان داده شده است، تقریباً در ۹۰ درصد مواردی که sale مقدار NaN دارد، ستون income_s_y مقدار 0 دارد؛ بنابراین این ردیف‌ها حذف می‌شوند.

In [90]:
# Discard every row that still has a missing value in any column
# (the index is not reset afterwards).
DF_P4_S2.dropna(axis=0, how='any', inplace=True)

In [91]:
# Missing-value count per column after dropping incomplete rows.
DF_P4_S2.isna().sum()

Address        0
member         0
employed_s     0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون employed_s را به دلیل نامتوازن بودن (imbalance) حذف می‌کنیم.

In [92]:
# For each column, print the share of all rows taken by its single most
# frequent value.
n_rows = len(DF_P4_S2)
for col in DF_P4_S2.columns:
    top_share = DF_P4_S2[col].value_counts().max() / n_rows
    print(f'{col:>15} {top_share:f}')

        Address 0.001271
         member 0.785033
     employed_s 0.977280
         ISCO_s 0.193994
         ISIC_s 0.105021
       status_s 0.856212
    agriculture 0.774547
        hours_s 0.297267
         days_s 0.476486
     Total_cost 0.181125
           sale 0.058627
     income_s_y 0.054655
           Fasl 0.256911
           year 1.000000


In [93]:
# Remove employed_s, flagged as imbalanced in the note above.
DF_P4_S2.drop(columns=['employed_s'], inplace=True)

In [94]:
# Final check: missing-value count per remaining column.
DF_P4_S2.isna().sum()

Address        0
member         0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

In [95]:
# Persist the cleaned P4_S02 table; path_file holds the output name without extension.
path_file = 'U1400_P4_S02'
DF_P4_S2.to_csv(f'{path_file}.csv', index=False)

### P4_S03

In [96]:
# Load the P4_S03 table (presumably one of the per-sheet CSVs exported from the
# workbook) and preview its first rows.
DF_P4_S3 = pd.read_csv('U1400P4S03.csv')
DF_P4_S3.head()

Unnamed: 0,Address,member,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,Fasl,year,DYCOL00
0,10001000226,1.0,420000000.0,,,15400000.0,,,1,1401,
1,10003003229,1.0,,,2000000.0,20640000.0,,,1,1401,
2,10003003229,3.0,42000000.0,,,,,,1,1401,
3,10011009725,1.0,,,12360000.0,8000000.0,,,1,1401,
4,10011009720,1.0,,,16560000.0,10000000.0,,,1,1401,


In [97]:
# Show column dtypes and non-null counts (income_pension is read as object here).
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24345 entries, 0 to 24344
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          24345 non-null  int64  
 1   member           24344 non-null  float64
 2   income_pension   10004 non-null  object 
 3   income_rent      6438 non-null   float64
 4   income_interest  15468 non-null  float64
 5   income_aid       17483 non-null  float64
 6   income_resale    5837 non-null   float64
 7   income_transfer  8647 non-null   float64
 8   Fasl             24345 non-null  int64  
 9   year             24345 non-null  int64  
 10  DYCOL00          0 non-null      float64
dtypes: float64(7), int64(3), object(1)
memory usage: 2.0+ MB


In [98]:
# Remove the DYCOL00 column (0 non-null values according to info() above).
DF_P4_S3.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
نوع ستون income_pension را به float تغییر می‌دهیم.

In [99]:
# income_pension was read as text (object dtype); convert it to pandas'
# nullable Float64 dtype.
pension_text = DF_P4_S3['income_pension'].str.strip()
# Entries that are empty once surrounding whitespace is stripped count as missing.
pension_text = pension_text.mask(pension_text == '', np.nan)
DF_P4_S3['income_pension'] = pension_text.astype('Float64')



<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income جمع می‌شوند و در ستون جدیدی به نام Total_income ذخیره می‌شوند.

In [100]:
# Collapse the six income columns into a single Total_income column.
del_columns = ['income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']
# Sum exactly the listed columns rather than a label slice, which would silently
# include any column that happens to sit between the two end labels. sum() skips
# missing values, so a row with no recorded income gets a total of 0.
# Total_income is inserted where the first income column currently sits, not at
# a hard-coded position.
DF_P4_S3.insert(
    DF_P4_S3.columns.get_loc(del_columns[0]),
    'Total_income',
    DF_P4_S3[del_columns].sum(axis=1),
)
DF_P4_S3.drop(columns=del_columns, inplace=True)

In [101]:
# Missing-value count per column after building Total_income.
DF_P4_S3.isna().sum()

Address         0
member          1
Total_income    0
Fasl            0
year            0
dtype: int64

In [102]:
# Discard every row that still has a missing value in any column
# (the index is not reset afterwards).
DF_P4_S3.dropna(axis=0, how='any', inplace=True)

In [103]:
# Final check: missing-value count per column after dropping incomplete rows.
DF_P4_S3.isna().sum()

Address         0
member          0
Total_income    0
Fasl            0
year            0
dtype: int64

In [104]:
# Persist the cleaned P4_S03 table; path_file holds the output name without extension.
path_file = 'U1400_P4_S03'
DF_P4_S3.to_csv(f'{path_file}.csv', index=False)

### P4_S04

In [105]:
# Load the P4_S04 table (presumably one of the per-sheet CSVs exported from the
# workbook) and preview its first rows.
DF_P4_S4 = pd.read_csv('U1400P4S04.csv')
DF_P4_S4.head()

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00
0,10001000226,1,1.0,12.0,1960000,1,1401,
1,10003003229,1,4.0,12.0,21840000,1,1401,
2,10011009725,1,2.0,12.0,10920000,1,1401,
3,10011009720,1,1.0,12.0,1610000,1,1401,
4,10011009720,2,2.0,12.0,10920000,1,1401,


In [106]:
# Remove the DYCOL00 column (it shows no values in the preview above).
DF_P4_S4.drop(columns=['DYCOL00'], inplace=True)

In [107]:
# Missing-value count per column.
DF_P4_S4.isna().sum()

Address           0
member            0
subsidy_number    2
subsidy_month     2
subsidy           0
Fasl              0
year              0
dtype: int64

In [108]:
# Discard every row that has a missing value in any column
# (the index is not reset afterwards).
DF_P4_S4.dropna(axis=0, how='any', inplace=True)

In [109]:
# Final check: missing-value count per column after dropping incomplete rows.
DF_P4_S4.isna().sum()

Address           0
member            0
subsidy_number    0
subsidy_month     0
subsidy           0
Fasl              0
year              0
dtype: int64

In [110]:
# Persist the cleaned P4_S04 table; path_file holds the output name without extension.
path_file = 'U1400_P4_S04'
DF_P4_S4.to_csv(f'{path_file}.csv', index=False)