In [239]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import statistics
#import missingno as msno

In [240]:
# Source workbook. sheet_name=None makes read_excel return a dict that maps
# every sheet name to its DataFrame.
path_file = 'R1401.xlsx'
# Read through path_file so the location is defined in exactly one place
# (the original repeated the literal and left path_file unused).
sheets_excel = pd.read_excel(path_file, sheet_name=None)

In [241]:
# Write every worksheet to "<sheet name>.csv" in the working directory,
# without the row index.
for name, sheet in sheets_excel.items():
    sheet.to_csv(f'{name}.csv', index=False)

### P4_S01

In [242]:
# Load the P4S01 table from CSV (presumably the export of the workbook sheet
# with the same name written by the loop above — confirm the sheet name).
DF_P4_S1 = pd.read_csv('R1401P4S01.csv')

In [243]:
# Show column dtypes and non-null counts of the raw table.
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10303 entries, 0 to 10302
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        10303 non-null  int64  
 1   member         10303 non-null  int64  
 2   employed_w     10303 non-null  int64  
 3   ISCO_w         10303 non-null  int64  
 4   ISIC_w         10302 non-null  float64
 5   status_w       10303 non-null  object 
 6   hours_w        10303 non-null  object 
 7   days_w         10303 non-null  object 
 8   income_w_m     10303 non-null  int64  
 9   income_w_y     10303 non-null  int64  
 10  wage_w_m       10300 non-null  float64
 11  wage_w_y       10299 non-null  float64
 12  perk_w_m       10301 non-null  float64
 13  perk_w_y       10302 non-null  float64
 14  netincome_w_m  10303 non-null  int64  
 15  netincome_w_y  10303 non-null  int64  
 16  Fasl           10303 non-null  int64  
 17  year           10303 non-null  int64  
 18  DYCOL0

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون DYCOL00 به‌طور کامل NaN است؛ بنابراین حذف می‌شود.

In [244]:
# Remove the DYCOL00 column in place.
DF_P4_S1.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_w، hours_w و status_w را به int تبدیل می‌کنیم.

In [245]:
# Cells holding only one or two spaces are treated as missing: replace them
# with NaN so the columns can be stored as nullable integers.
for col in ('days_w', 'hours_w', 'status_w'):
    is_blank = DF_P4_S1[col].isin([' ', '  '])
    DF_P4_S1.loc[is_blank, col] = np.nan

# Int64 (capital I) is the pandas nullable integer dtype, so NaN survives the cast.
for col in ('status_w', 'days_w', 'hours_w'):
    DF_P4_S1[col] = DF_P4_S1[col].astype('Int64')

In [246]:
# Re-check dtypes and non-null counts after the conversion above.
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10303 entries, 0 to 10302
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        10303 non-null  int64  
 1   member         10303 non-null  int64  
 2   employed_w     10303 non-null  int64  
 3   ISCO_w         10303 non-null  int64  
 4   ISIC_w         10302 non-null  float64
 5   status_w       10300 non-null  Int64  
 6   hours_w        9308 non-null   Int64  
 7   days_w         9308 non-null   Int64  
 8   income_w_m     10303 non-null  int64  
 9   income_w_y     10303 non-null  int64  
 10  wage_w_m       10300 non-null  float64
 11  wage_w_y       10299 non-null  float64
 12  perk_w_m       10301 non-null  float64
 13  perk_w_y       10302 non-null  float64
 14  netincome_w_m  10303 non-null  int64  
 15  netincome_w_y  10303 non-null  int64  
 16  Fasl           10303 non-null  int64  
 17  year           10303 non-null  int64  
dtypes: Int

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
همبستگی ستون‌های income_w، wage_w و netincome_w مورد بررسی قرار گرفت و چون همبستگی ستون income در سال‌های دیگر کم بود، فقط ستون wage_w حذف می‌شود.

In [247]:
# Pairwise correlation matrix (DataFrame.corr with default settings) of the
# three monthly columns.
DF_P4_S1[['income_w_m', 'wage_w_m', 'netincome_w_m']].corr()

Unnamed: 0,income_w_m,wage_w_m,netincome_w_m
income_w_m,1.0,0.868651,0.925823
wage_w_m,0.868651,1.0,0.938779
netincome_w_m,0.925823,0.938779,1.0


In [248]:
# Same correlation check for the three yearly columns.
DF_P4_S1[['income_w_y', 'wage_w_y', 'netincome_w_y']].corr()

Unnamed: 0,income_w_y,wage_w_y,netincome_w_y
income_w_y,1.0,0.813705,0.833815
wage_w_y,0.813705,1.0,0.978596
netincome_w_y,0.833815,0.978596,1.0


In [249]:
# Drop both wage columns (monthly and yearly) in a single call.
DF_P4_S1.drop(columns=['wage_w_m', 'wage_w_y'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
با توجه به اطلاعات زیر، ستون‌های employed_w، status_w، perk_w_y و perk_w_m دارای عدم توازن (imbalance) هستند؛ بنابراین آن‌ها را حذف می‌کنیم.

In [250]:
# For each column print the fraction of all rows taken by its most frequent
# value. value_counts() skips NaN, while the denominator is the full row count.
row_count = len(DF_P4_S1)
for col in DF_P4_S1.columns:
    top_share = DF_P4_S1[col].value_counts().max() / row_count
    print(f'{col :>15} {top_share:f}')

        Address 0.000582
         member 0.683587
     employed_w 0.903329
         ISCO_w 0.288945
         ISIC_w 0.276036
       status_w 0.898476
        hours_w 0.537125
         days_w 0.392216
     income_w_m 0.127827
     income_w_y 0.052315
       perk_w_m 0.954382
       perk_w_y 0.768514
  netincome_w_m 0.128312
  netincome_w_y 0.052800
           Fasl 0.259245
           year 1.000000


In [251]:
# Remove the four columns that the share table above shows are dominated by
# a single value.
imbalanced_columns = ['employed_w', 'status_w', 'perk_w_y', 'perk_w_m']
DF_P4_S1.drop(columns=imbalanced_columns, inplace=True)

In [252]:
# Count missing values per column before imputation.
DF_P4_S1.isna().sum()

Address            0
member             0
ISCO_w             0
ISIC_w             1
hours_w          995
days_w           995
income_w_m         0
income_w_y         0
netincome_w_m      0
netincome_w_y      0
Fasl               0
year               0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_w و hours_w را با میانهٔ هر گروه (گروه‌بندی روی ISCO_w و ISIC_w) پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [253]:
# Median of hours_w / days_w inside each (ISCO_w, ISIC_w) group, broadcast
# back to one value per row by transform().
temp = DF_P4_S1.groupby(['ISCO_w', 'ISIC_w'])[['hours_w', 'days_w']].transform('median')
# A median can end in .5, which cannot be cast safely to an integer dtype.
# Round down first, exactly as the matching P4_S02 cell does.
temp = np.floor(temp).astype(pd.Int16Dtype())

# Fill only the rows that are missing; rows whose group has no observed value
# (or whose grouping key is NaN) stay missing and are handled later.
for col in ('hours_w', 'days_w'):
    missing = DF_P4_S1[col].isna()
    DF_P4_S1.loc[missing, col] = temp.loc[missing, col]

In [254]:
# Missing values that remain after the group-median fill.
DF_P4_S1.isna().sum()

Address           0
member            0
ISCO_w            0
ISIC_w            1
hours_w          49
days_w           50
income_w_m        0
income_w_y        0
netincome_w_m     0
netincome_w_y     0
Fasl              0
year              0
dtype: int64

In [255]:
# Fill what the group medians could not cover with the column-wide median.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# DF_P4_S1.hours_w acts on an intermediate Series, which pandas deprecates and
# which leaves the frame unchanged once Copy-on-Write is active.
# np.floor keeps a fractional median (x.5) valid for the nullable-integer columns.
for col in ('hours_w', 'days_w'):
    DF_P4_S1[col] = DF_P4_S1[col].fillna(np.floor(DF_P4_S1[col].median()))

In [256]:
# Discard every row that still contains a missing value (per the count above,
# only ISIC_w has one left at this point).
DF_P4_S1.dropna(axis=0, how='any', inplace=True)

In [257]:
# Final check: every column should now report zero missing values.
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [259]:
# Persist the cleaned P4_S01 table (file stem + ".csv", no index column).
path_file = 'R1401_P4_S01'
DF_P4_S1.to_csv(f'{path_file}.csv', index=False)

### P4_S02

In [260]:
# Load the P4S02 table from CSV and preview its first rows.
DF_P4_S2 = pd.read_csv('R1401P4S02.csv')
DF_P4_S2.head()

Unnamed: 0,Address,member,employed_s,ISCO_s,ISIC_s,status_s,agriculture,hours_s,days_s,cost_employment,cost_raw,cost_machinery,cost_others,cost_tax,sale,income_s_y,Fasl,year,DYCOL00
0,20001384026,1,1,6112,1240,4,1,5,4,150000000,41000000,0,0,0,320000000,129000000.0,3,1401,
1,20001384038,1,1,6112,1240,4,1,8,5,60000000,62000000,0,15000000,0,200000000,63000000.0,3,1401,
2,20001385231,1,1,6121,1440,5,1,4,7,0,203000000,0,0,0,370000000,167000000.0,3,1401,
3,20001385231,3,1,6121,1440,6,1,4,7,0,0,0,0,0,0,,3,1401,
4,20001385231,4,1,6121,1440,6,1,4,7,0,0,0,0,0,0,,3,1401,


In [261]:
# Show column dtypes and non-null counts of the raw table.
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11139 entries, 0 to 11138
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          11139 non-null  int64  
 1   member           11139 non-null  int64  
 2   employed_s       11139 non-null  int64  
 3   ISCO_s           11139 non-null  int64  
 4   ISIC_s           11139 non-null  int64  
 5   status_s         11139 non-null  int64  
 6   agriculture      11139 non-null  int64  
 7   hours_s          11139 non-null  object 
 8   days_s           11139 non-null  object 
 9   cost_employment  11139 non-null  int64  
 10  cost_raw         11139 non-null  int64  
 11  cost_machinery   11139 non-null  int64  
 12  cost_others      11139 non-null  int64  
 13  cost_tax         11139 non-null  int64  
 14  sale             11139 non-null  int64  
 15  income_s_y       11138 non-null  object 
 16  Fasl             11139 non-null  int64  
 17  year        

In [262]:
# Remove the DYCOL00 column in place.
DF_P4_S2.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_s، hours_s، status_s و income_s_y را به int تبدیل می‌کنیم.

In [263]:
# List the distinct raw values of hours_s to see how blanks and padding are
# encoded before converting the column.
DF_P4_S2.hours_s.unique()

array(['5 ', '8 ', '4 ', '7 ', '9 ', '  ', '3 ', '1 ', '6 ', '10', '2 ',
       '12', '11', '18', '15', '13', '14', '17', '04', '16', '09', '08'],
      dtype=object)

In [264]:
# The two-space string is the blank marker seen in unique(); turn it into NaN
# and store the column as a nullable integer.
blank_hours = DF_P4_S2['hours_s'] == '  '
DF_P4_S2.loc[blank_hours, 'hours_s'] = np.nan
DF_P4_S2['hours_s'] = DF_P4_S2['hours_s'].astype('Int64')

In [265]:
# List the distinct raw values of days_s before converting the column.
DF_P4_S2.days_s.unique()

array(['4', '5', '7', '6', ' ', '2', '3', '1'], dtype=object)

In [266]:
# The single-space string is the blank marker seen in unique(); turn it into
# NaN and store the column as a nullable integer.
blank_days = DF_P4_S2['days_s'] == ' '
DF_P4_S2.loc[blank_days, 'days_s'] = np.nan
DF_P4_S2['days_s'] = DF_P4_S2['days_s'].astype('Int64')

In [267]:
# Trim surrounding whitespace so blank entries collapse to the empty string.
DF_P4_S2['income_s_y'] = DF_P4_S2['income_s_y'].str.strip()

In [268]:
# Empty strings left after stripping become NaN; the column is then stored
# as a nullable integer.
empty_income = DF_P4_S2['income_s_y'] == ''
DF_P4_S2.loc[empty_income, 'income_s_y'] = np.nan
DF_P4_S2['income_s_y'] = DF_P4_S2['income_s_y'].astype('Int64')

In [269]:
# Re-check dtypes and non-null counts after the conversions above.
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11139 entries, 0 to 11138
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Address          11139 non-null  int64
 1   member           11139 non-null  int64
 2   employed_s       11139 non-null  int64
 3   ISCO_s           11139 non-null  int64
 4   ISIC_s           11139 non-null  int64
 5   status_s         11139 non-null  int64
 6   agriculture      11139 non-null  int64
 7   hours_s          10887 non-null  Int64
 8   days_s           10884 non-null  Int64
 9   cost_employment  11139 non-null  int64
 10  cost_raw         11139 non-null  int64
 11  cost_machinery   11139 non-null  int64
 12  cost_others      11139 non-null  int64
 13  cost_tax         11139 non-null  int64
 14  sale             11139 non-null  int64
 15  income_s_y       10795 non-null  Int64
 16  Fasl             11139 non-null  int64
 17  year             11139 non-null  int64
dtypes: Int

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های cost جمع می‌شوند و در ستون جدیدی به نام Total_cost ذخیره می‌شوند.

In [270]:
# The five itemised cost columns that are collapsed into one total.
del_columns = ['cost_employment', 'cost_raw', 'cost_machinery', 'cost_others', 'cost_tax']
# Sum exactly the columns that are dropped below. The previous label slice
# 'cost_employment':'cost_tax' only matched this list while the columns were
# contiguous and in this order.
total_cost = DF_P4_S2[del_columns].sum(axis=1)
# Put the total where the first cost column sits, instead of a hard-coded position.
insert_at = DF_P4_S2.columns.get_loc(del_columns[0])
DF_P4_S2.insert(insert_at, 'Total_cost', total_cost)
DF_P4_S2.drop(columns=del_columns, inplace=True)

In [271]:
# Count missing values per column before imputation.
DF_P4_S2.isna().sum()

Address          0
member           0
employed_s       0
ISCO_s           0
ISIC_s           0
status_s         0
agriculture      0
hours_s        252
days_s         255
Total_cost       0
sale             0
income_s_y     344
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_s و hours_s را با میانهٔ هر گروه (گروه‌بندی روی ISCO_s و ISIC_s) پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [272]:
# Median of hours_s / days_s inside each (ISCO_s, ISIC_s) group, one value per
# row, rounded down so it fits an integer dtype.
temp = DF_P4_S2.groupby(['ISCO_s', 'ISIC_s'])[['hours_s', 'days_s']].transform('median')
temp = np.floor(temp).astype(pd.Int16Dtype())

for col in ('hours_s', 'days_s'):
    # First choice: the group median, written only into the missing rows.
    missing = DF_P4_S2[col].isna()
    DF_P4_S2.loc[missing, col] = temp.loc[missing, col]
    # Fallback: the column-wide median. The result is assigned back to the
    # frame: fillna(inplace=True) on DF_P4_S2.<col> acts on an intermediate
    # Series, which pandas deprecates and which leaves the frame unchanged
    # once Copy-on-Write is active. np.floor keeps a fractional median valid
    # for the nullable-integer column.
    DF_P4_S2[col] = DF_P4_S2[col].fillna(np.floor(DF_P4_S2[col].median()))

In [273]:
# Missing values that remain after filling hours_s and days_s.
DF_P4_S2.isna().sum()

Address          0
member           0
employed_s       0
ISCO_s           0
ISIC_s           0
status_s         0
agriculture      0
hours_s          0
days_s           0
Total_cost       0
sale             0
income_s_y     344
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون income_s_y
همان‌طور که در کد زیر نشان داده شده است، تقریباً در همهٔ مواردِ NaN، ستون status_s مقدار ۶ (کارکن فامیلی) دارد.

In [274]:
# Among the rows with missing income_s_y, count how often each status_s code occurs.
DF_P4_S2.loc[DF_P4_S2.income_s_y.isna()].status_s.value_counts()

status_s
6    338
5      6
Name: count, dtype: int64

In [275]:
# income_s_y is the only column that still has gaps at this point (see the
# isna() summary above); set those to 0. The fill is limited to this column
# so a frame-wide fillna(0) cannot silently zero out an unexpected NaN elsewhere.
DF_P4_S2['income_s_y'] = DF_P4_S2['income_s_y'].fillna(0)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون employed_s را به دلیل عدم توازن (imbalance) حذف می‌کنیم.

In [276]:
# For each column print the fraction of all rows taken by its most frequent value.
row_count = len(DF_P4_S2)
for col in DF_P4_S2.columns:
    top_share = DF_P4_S2[col].value_counts().max() / row_count
    print(f'{col :>15} {top_share:f}')

        Address 0.001077
         member 0.738127
     employed_s 0.976479
         ISCO_s 0.307299
         ISIC_s 0.229464
       status_s 0.763354
    agriculture 0.719185
        hours_s 0.195529
         days_s 0.415298
     Total_cost 0.194452
           sale 0.177215
     income_s_y 0.159260
           Fasl 0.264925
           year 1.000000


In [277]:
# Remove employed_s, which the share table above shows is dominated by one value.
DF_P4_S2.drop(columns=['employed_s'], inplace=True)

In [278]:
# Final check: every column should now report zero missing values.
DF_P4_S2.isna().sum()

Address        0
member         0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

In [294]:
# Persist the cleaned P4_S02 table (file stem + ".csv", no index column).
path_file = 'R1401_P4_S02'
DF_P4_S2.to_csv(f'{path_file}.csv', index=False)

### P4_S03

In [283]:
# Load the P4S03 table from CSV and preview its first rows.
DF_P4_S3 = pd.read_csv('R1401P4S03.csv')
DF_P4_S3.head()

Unnamed: 0,Address,member,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,Fasl,year,DYCOL00
0,20001384026,1,0,0,54000000,67350000,0,0,3,1401,
1,20001384027,1,0,0,5000000,84350000,0,0,3,1401,
2,20001384031,1,0,0,4700000,74660000,0,0,3,1401,
3,20001384035,1,0,0,7500000,82040000,0,0,3,1401,
4,20001384038,1,0,0,17000000,129350000,0,0,3,1401,


In [284]:
# Show column dtypes and non-null counts of the raw table.
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26941 entries, 0 to 26940
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          26941 non-null  int64  
 1   member           26941 non-null  int64  
 2   income_pension   26941 non-null  int64  
 3   income_rent      26941 non-null  int64  
 4   income_interest  26941 non-null  int64  
 5   income_aid       26941 non-null  int64  
 6   income_resale    26941 non-null  int64  
 7   income_transfer  26941 non-null  int64  
 8   Fasl             26941 non-null  int64  
 9   year             26941 non-null  int64  
 10  DYCOL00          0 non-null      float64
dtypes: float64(1), int64(10)
memory usage: 2.3 MB


In [285]:
# Remove DYCOL00, which info() reports as having zero non-null entries.
DF_P4_S3.drop(columns=['DYCOL00'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income جمع می‌شوند و در ستون جدیدی به نام Total_income ذخیره می‌شوند.

In [286]:
# The six income components that are collapsed into one total.
del_columns = ['income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']
# Sum exactly the columns that are dropped below. The previous label slice
# 'income_pension':'income_transfer' only matched this list while the columns
# were contiguous and in this order.
total_income = DF_P4_S3[del_columns].sum(axis=1)
# Put the total where the first income column sits, instead of a hard-coded position.
insert_at = DF_P4_S3.columns.get_loc(del_columns[0])
DF_P4_S3.insert(insert_at, 'Total_income', total_income)
DF_P4_S3.drop(columns=del_columns, inplace=True)

In [287]:
# Count missing values per column of the reduced table.
DF_P4_S3.isna().sum()

Address         0
member          0
Total_income    0
Fasl            0
year            0
dtype: int64

In [295]:
# Persist the cleaned P4_S03 table (file stem + ".csv", no index column).
path_file = 'R1401_P4_S03'
DF_P4_S3.to_csv(f'{path_file}.csv', index=False)

### P4_S04

In [290]:
# Load the P4S04 table from CSV and preview its first rows.
DF_P4_S4 = pd.read_csv('R1401P4S04.csv')
DF_P4_S4.head()

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00
0,20001384026,1,1,7,735000,3,1401,
1,20001384026,2,1,7,3185000,3,1401,
2,20001384027,1,1,7,735000,3,1401,
3,20001384027,2,4,7,12740000,3,1401,
4,20001384031,1,1,7,735000,3,1401,


In [291]:
# Remove the DYCOL00 column in place.
DF_P4_S4.drop(columns=['DYCOL00'], inplace=True)

In [292]:
# Show column dtypes and non-null counts after dropping DYCOL00.
DF_P4_S4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30755 entries, 0 to 30754
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Address         30755 non-null  int64
 1   member          30755 non-null  int64
 2   subsidy_number  30755 non-null  int64
 3   subsidy_month   30755 non-null  int64
 4   subsidy         30755 non-null  int64
 5   Fasl            30755 non-null  int64
 6   year            30755 non-null  int64
dtypes: int64(7)
memory usage: 1.6 MB


In [296]:
# Persist the P4_S04 table (file stem + ".csv", no index column).
path_file = 'R1401_P4_S04'
DF_P4_S4.to_csv(f'{path_file}.csv', index=False)