In [140]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import statistics
#import missingno as msno

In [141]:
# Read every sheet of the workbook at once; sheet_name=None makes pandas
# return a dict mapping each sheet name to its DataFrame.
path_file = 'R98.xlsx'
sheets_excel = pd.read_excel(path_file, sheet_name=None)

In [142]:
# Dump each worksheet to "<sheet name>.csv" in the working directory.
for name, sheet in sheets_excel.items():
    sheet.to_csv(f'{name}.csv', index=False)

### P4_S01

In [143]:
# Load the P4_S01 sheet that was exported to CSV above.
# NOTE(review): the file name says 1399, while the data's `year` column is 98
# and the cleaned output is saved as R1398_* — confirm the sheet naming.
DF_P4_S1 = pd.read_csv('R1399P4S01.csv')

In [144]:
# Show column dtypes and non-null counts of the raw frame.
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10820 entries, 0 to 10819
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        10820 non-null  int64  
 1   member         10820 non-null  int64  
 2   employed_w     10820 non-null  int64  
 3   ISCO_w         10820 non-null  int64  
 4   ISIC_w         10820 non-null  int64  
 5   status_w       10820 non-null  int64  
 6   hours_w        9518 non-null   float64
 7   days_w         9517 non-null   float64
 8   income_w_m     10820 non-null  int64  
 9   income_w_y     10820 non-null  int64  
 10  wage_w_m       10814 non-null  float64
 11  wage_w_y       10817 non-null  float64
 12  perk_w_m       10812 non-null  float64
 13  perk_w_y       10817 non-null  float64
 14  netincome_w_m  10820 non-null  int64  
 15  netincome_w_y  10820 non-null  int64  
 16  Fasl           10820 non-null  int64  
 17  year           10820 non-null  int64  
 18  DYCOL0

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون DYCOL00 کاملاً NaN است.

In [145]:
# DYCOL00 carries no data, so remove it.
DF_P4_S1.drop(columns='DYCOL00', inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
همبستگی ستون‌های income_w، wage_w و netincome_w مورد بررسی قرار گرفت و چون همبستگی ستون income در سال‌های دیگر کم بود، فقط ستون wage_w حذف می‌شود.

In [146]:
# Pairwise correlation between the three monthly income measures.
monthly_income_cols = ['income_w_m', 'wage_w_m', 'netincome_w_m']
DF_P4_S1.loc[:, monthly_income_cols].corr()

Unnamed: 0,income_w_m,wage_w_m,netincome_w_m
income_w_m,1.0,0.799613,0.826813
wage_w_m,0.799613,1.0,0.973612
netincome_w_m,0.826813,0.973612,1.0


In [147]:
# Pairwise correlation between the three yearly income measures.
yearly_income_cols = ['income_w_y', 'wage_w_y', 'netincome_w_y']
DF_P4_S1.loc[:, yearly_income_cols].corr()

Unnamed: 0,income_w_y,wage_w_y,netincome_w_y
income_w_y,1.0,0.952265,0.965979
wage_w_y,0.952265,1.0,0.984304
netincome_w_y,0.965979,0.984304,1.0


In [148]:
# Remove both wage columns (monthly and yearly) in a single call.
DF_P4_S1.drop(columns=['wage_w_m', 'wage_w_y'], inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
با توجه به اطلاعات زیر، ستون‌های employed_w، status_w، perk_w_y و perk_w_m دارای imbalance هستند؛ بنابراین آن‌ها را حذف می‌کنیم.

In [149]:
# For every column, print the fraction of rows taken by its most frequent
# value; a value close to 1 means the column is almost constant.
n_rows = len(DF_P4_S1)
for col in DF_P4_S1.columns:
    top_share = (DF_P4_S1[col].value_counts() / n_rows).max()
    print(f'{col :>15} {top_share:f}')

        Address 0.000647
         member 0.668854
     employed_w 0.863124
         ISCO_w 0.281516
         ISIC_w 0.271442
       status_w 0.899815
        hours_w 0.514048
         days_w 0.344177
     income_w_m 0.162384
     income_w_y 0.053604
       perk_w_m 0.955176
       perk_w_y 0.789187
  netincome_w_m 0.162754
  netincome_w_y 0.053235
           Fasl 0.261275
           year 1.000000


In [150]:
# Remove the columns dominated by a single value.
DF_P4_S1.drop(columns=['employed_w', 'status_w', 'perk_w_y', 'perk_w_m'], inplace=True)

In [151]:
# Number of missing values per column.
DF_P4_S1.isna().sum()

Address             0
member              0
ISCO_w              0
ISIC_w              0
hours_w          1302
days_w           1303
income_w_m          0
income_w_y          0
netincome_w_m       0
netincome_w_y       0
Fasl                0
year                0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_w و hours_w را با میانهٔ هر گروه (گروه‌بندی روی ISCO_w و ISIC_w) پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [152]:
# Median of hours/days within each (ISCO_w, ISIC_w) group, broadcast back to
# the original row index so it can be used as a row-wise fill value.
temp = DF_P4_S1.groupby(['ISCO_w', 'ISIC_w'])[['hours_w', 'days_w']].transform('median')
# Only missing entries are replaced; groups whose median is itself NaN
# leave the gap in place for a later fallback.
for col in ('hours_w', 'days_w'):
    DF_P4_S1[col] = DF_P4_S1[col].fillna(temp[col])

In [153]:
# Missing values left after the group-median fill.
DF_P4_S1.isna().sum()

Address           0
member            0
ISCO_w            0
ISIC_w            0
hours_w          67
days_w           67
income_w_m        0
income_w_y        0
netincome_w_m     0
netincome_w_y     0
Fasl              0
year              0
dtype: int64

In [154]:
# Fallback: fill the remaining gaps with the overall column median.
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: that chained in-place form raises a FutureWarning
# in pandas 2.x and no longer updates the frame under Copy-on-Write.
DF_P4_S1['hours_w'] = DF_P4_S1['hours_w'].fillna(DF_P4_S1['hours_w'].median())
DF_P4_S1['days_w'] = DF_P4_S1['days_w'].fillna(DF_P4_S1['days_w'].median())

In [155]:
# Confirm that no missing values remain.
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [156]:
# Persist the cleaned P4_S01 table.
path_file = 'R1398_P4_S01'
DF_P4_S1.to_csv(f'{path_file}.csv', index=False)

### P4_S02

In [157]:
# Load the P4_S02 sheet exported to CSV earlier and preview the first rows.
DF_P4_S2 = pd.read_csv('R1399P4S02.csv')
DF_P4_S2.head()

Unnamed: 0,Address,member,employed_s,ISCO_s,ISIC_s,status_s,agriculture,hours_s,days_s,cost_employment,cost_raw,cost_machinery,cost_others,cost_tax,sale,income_s_y,Fasl,year,DYCOL00
0,20011395420,1,1,6121,1440,5,1,5.0,7.0,20000000.0,50000000.0,,,,142500000.0,72500000,1,98,
1,20001385423,1,1,6111,1110,5,1,6.0,5.0,,21500000.0,,21500000.0,,113000000.0,70000000,1,98,
2,20011395407,1,1,6111,1110,5,1,7.0,5.0,,76500000.0,,,,185000000.0,108500000,1,98,
3,20011395407,1,1,6121,1440,5,1,4.0,7.0,,105000000.0,,,,205000000.0,100000000,1,98,
4,20004387823,1,1,8322,49230,5,2,8.0,2.0,,25000000.0,20000000.0,25000000.0,,190000000.0,120000000,1,98,


In [158]:
# Show column dtypes and non-null counts of the raw frame.
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14339 entries, 0 to 14338
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          14339 non-null  int64  
 1   member           14339 non-null  int64  
 2   employed_s       14339 non-null  int64  
 3   ISCO_s           14339 non-null  int64  
 4   ISIC_s           14339 non-null  int64  
 5   status_s         14339 non-null  int64  
 6   agriculture      14339 non-null  int64  
 7   hours_s          14086 non-null  float64
 8   days_s           14086 non-null  float64
 9   cost_employment  12935 non-null  float64
 10  cost_raw         14035 non-null  float64
 11  cost_machinery   13085 non-null  float64
 12  cost_others      13606 non-null  float64
 13  cost_tax         12548 non-null  float64
 14  sale             14108 non-null  float64
 15  income_s_y       14339 non-null  int64  
 16  Fasl             14339 non-null  int64  
 17  year        

In [159]:
# Remove the DYCOL00 column.
DF_P4_S2.drop(columns='DYCOL00', inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های cost جمع می‌شوند و در ستون جدیدی به نام Total_cost ذخیره می‌شوند.

In [160]:
# Collapse the individual cost columns into a single Total_cost column.
del_columns = ['cost_employment', 'cost_raw', 'cost_machinery', 'cost_others', 'cost_tax']
# Place the total where the first cost column currently sits, rather than at
# a hard-coded position, so the cell does not depend on the exact column count.
insert_at = DF_P4_S2.columns.get_loc(del_columns[0])
# sum(axis=1) skips NaN, so a row with no cost recorded at all gets 0.
DF_P4_S2.insert(insert_at, 'Total_cost', DF_P4_S2[del_columns].sum(axis=1))
DF_P4_S2.drop(columns=del_columns, inplace=True)

In [161]:
# Number of missing values per column.
DF_P4_S2.isna().sum()

Address          0
member           0
employed_s       0
ISCO_s           0
ISIC_s           0
status_s         0
agriculture      0
hours_s        253
days_s         253
Total_cost       0
sale           231
income_s_y       0
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های days_s و hours_s را با میانهٔ هر گروه (گروه‌بندی روی ISCO_s و ISIC_s) پر می‌کنیم و مابقی را با میانهٔ کل داده‌ها پر می‌کنیم.

In [162]:
# Median of hours/days within each (ISCO_s, ISIC_s) group, broadcast back to
# the original row index.
temp = DF_P4_S2.groupby(['ISCO_s', 'ISIC_s'])[['hours_s', 'days_s']].transform('median')

for col in ('hours_s', 'days_s'):
    # First choice: the median of the row's own occupation/industry group.
    DF_P4_S2[col] = DF_P4_S2[col].fillna(temp[col])
    # Fallback for groups without any observed value: overall column median.
    # The result is assigned back instead of using fillna(inplace=True) on an
    # attribute-accessed column, which raises a FutureWarning in pandas 2.x
    # and has no effect on the frame under Copy-on-Write.
    DF_P4_S2[col] = DF_P4_S2[col].fillna(DF_P4_S2[col].median())

In [163]:
# Missing values left after imputing hours_s and days_s.
DF_P4_S2.isna().sum()

Address          0
member           0
employed_s       0
ISCO_s           0
ISIC_s           0
status_s         0
agriculture      0
hours_s          0
days_s           0
Total_cost       0
sale           231
income_s_y       0
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون sale
همان‌طور که در کد زیر نشان داده شده است، تقریباً در ۱۰۰ درصد مواردی که ستون sale مقدار NaN دارد، ستون income_s_y مقدار 0 دارد.

In [164]:
# Among the rows where `sale` is missing, percentage share of each
# `income_s_y` value.
sale_missing = DF_P4_S2['sale'].isna()
DF_P4_S2.loc[sale_missing, 'income_s_y'].value_counts() / sale_missing.sum() * 100

income_s_y
0            99.5671
240000000     0.4329
Name: count, dtype: float64

In [165]:
# Distribution of `status_s` among the rows where `sale` is missing.
DF_P4_S2.loc[DF_P4_S2['sale'].isna(), 'status_s'].value_counts()

status_s
6    230
5      1
Name: count, dtype: int64

In [166]:
# Missing `sale` is treated as 0. Only `sale` is filled: it is the only column
# with NaN at this point, and a frame-wide fillna(0) would silently zero any
# unexpected gap in other columns as well.
DF_P4_S2['sale'] = DF_P4_S2['sale'].fillna(0)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون employed_s را به دلیل imbalance حذف می‌کنیم.

In [167]:
# For every column, print the fraction of rows taken by its most frequent
# value; a value close to 1 means the column is almost constant.
n_rows = len(DF_P4_S2)
for col in DF_P4_S2.columns:
    top_share = (DF_P4_S2[col].value_counts() / n_rows).max()
    print(f'{col :>15} {top_share:f}')

        Address 0.000767
         member 0.702769
     employed_s 0.978171
         ISCO_s 0.347932
         ISIC_s 0.259642
       status_s 0.711207
    agriculture 0.766092
        hours_s 0.166678
         days_s 0.479601
     Total_cost 0.230281
           sale 0.214869
     income_s_y 0.202594
           Fasl 0.264523
           year 1.000000


In [168]:
# Remove the near-constant employed_s column.
DF_P4_S2.drop(columns='employed_s', inplace=True)

In [169]:
# Confirm that no missing values remain.
DF_P4_S2.isna().sum()

Address        0
member         0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

In [170]:
# Persist the cleaned P4_S02 table.
path_file = 'R1398_P4_S02'
DF_P4_S2.to_csv(f'{path_file}.csv', index=False)

### P4_S03

In [171]:
# Load the P4_S03 sheet exported to CSV earlier and preview the first rows.
DF_P4_S3 = pd.read_csv('R1399P4S03.csv')
DF_P4_S3.head()

Unnamed: 0,Address,member,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,Fasl,year,DYCOL00
0,20001385412,1,,24600000.0,,,,,1,98,
1,20001385423,1,154000000.0,,,,,,1,98,
2,20001385420,1,130000000.0,,,,,,1,98,
3,20011395423,1,,,,,5000000.0,5000000.0,1,98,
4,20011395416,1,,80000000.0,,12000000.0,,,1,98,


In [172]:
# Show column dtypes and non-null counts of the raw frame.
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18827 entries, 0 to 18826
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          18827 non-null  int64  
 1   member           18827 non-null  int64  
 2   income_pension   6193 non-null   object 
 3   income_rent      5491 non-null   object 
 4   income_interest  13222 non-null  float64
 5   income_aid       10355 non-null  float64
 6   income_resale    4873 non-null   float64
 7   income_transfer  8242 non-null   float64
 8   Fasl             18827 non-null  int64  
 9   year             18827 non-null  int64  
 10  DYCOL00          0 non-null      float64
dtypes: float64(5), int64(4), object(2)
memory usage: 1.6+ MB


In [173]:
# DYCOL00 has zero non-null entries, so remove it.
DF_P4_S3.drop(columns='DYCOL00', inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income_pension و income_rent را به float تبدیل می‌کنیم.

In [174]:
# income_pension and income_rent were read as `object`. Trim surrounding
# whitespace, turn strings that are empty after trimming into NaN, then cast
# to the nullable Float64 dtype.
for income_col in ('income_pension', 'income_rent'):
    trimmed = DF_P4_S3[income_col].str.strip()
    trimmed = trimmed.mask(trimmed.isin(['']), np.nan)
    DF_P4_S3[income_col] = trimmed.astype(pd.Float64Dtype())


<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income جمع می‌شوند و در ستون جدیدی به نام Total_income ذخیره می‌شوند.

In [175]:
# Collapse the individual income columns into a single Total_income column.
del_columns = ['income_pension', 'income_rent', 'income_interest', 'income_aid', 'income_resale', 'income_transfer']
# Place the total where the first income column currently sits, rather than
# at a hard-coded position.
insert_at = DF_P4_S3.columns.get_loc(del_columns[0])
# sum(axis=1) skips missing values, so a row with no income recorded gets 0.
DF_P4_S3.insert(insert_at, 'Total_income', DF_P4_S3[del_columns].sum(axis=1))
DF_P4_S3.drop(columns=del_columns, inplace=True)

In [176]:
# Confirm that no missing values remain.
DF_P4_S3.isna().sum()

Address         0
member          0
Total_income    0
Fasl            0
year            0
dtype: int64

In [177]:
# Persist the cleaned P4_S03 table.
path_file = 'R1398_P4_S03'
DF_P4_S3.to_csv(f'{path_file}.csv', index=False)

### P4_S04

In [178]:
# Load the P4_S04 sheet exported to CSV earlier and preview the first rows.
DF_P4_S4 = pd.read_csv('R1399P4S04.csv')
DF_P4_S4.head()

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00
0,20011395420,1,3,12,16380000,1,98,
1,20001385412,1,4,12,21840000,1,98,
2,20001385423,1,2,12,10920000,1,98,
3,20001385420,1,1,12,5460000,1,98,
4,20011395407,1,4,12,21840000,1,98,


In [179]:
# Show column dtypes and non-null counts of the raw frame.
DF_P4_S4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19610 entries, 0 to 19609
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Address         19610 non-null  int64  
 1   member          19610 non-null  int64  
 2   subsidy_number  19610 non-null  int64  
 3   subsidy_month   19610 non-null  int64  
 4   subsidy         19610 non-null  int64  
 5   Fasl            19610 non-null  int64  
 6   year            19610 non-null  int64  
 7   DYCOL00         0 non-null      float64
dtypes: float64(1), int64(7)
memory usage: 1.2 MB


In [180]:
# DYCOL00 has zero non-null entries, so remove it.
DF_P4_S4.drop(columns='DYCOL00', inplace=True)

In [181]:
# Number of missing values per column.
DF_P4_S4.isna().sum()

Address           0
member            0
subsidy_number    0
subsidy_month     0
subsidy           0
Fasl              0
year              0
dtype: int64

In [182]:
# Persist the cleaned P4_S04 table.
path_file = 'R1398_P4_S04'
DF_P4_S4.to_csv(f'{path_file}.csv', index=False)