In [263]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import statistics
#import missingno as msno

In [264]:
path_file = 'R99.xlsx'
sheets_excel = pd.read_excel('R99.xlsx',sheet_name=None)

In [265]:
for name in list(sheets_excel.keys()):
    sheets_excel[name].to_csv(name+'.csv', index=False)

### P4_S01

In [266]:
DF_P4_S1 = pd.read_csv('R99P4S01.csv')

In [267]:
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10887 entries, 0 to 10886
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        10887 non-null  int64  
 1   member         10887 non-null  int64  
 2   employed_w     10887 non-null  int64  
 3   ISCO_w         10887 non-null  int64  
 4   ISIC_w         10887 non-null  int64  
 5   status_w       10887 non-null  int64  
 6   hours_w        9691 non-null   float64
 7   days_w         9691 non-null   float64
 8   income_w_m     10887 non-null  int64  
 9   income_w_y     10887 non-null  int64  
 10  wage_w_m       10873 non-null  float64
 11  wage_w_y       10886 non-null  float64
 12  perk_w_m       10878 non-null  float64
 13  perk_w_y       10885 non-null  float64
 14  netincome_w_m  10887 non-null  int64  
 15  netincome_w_y  10887 non-null  int64  
 16  Fasl           10887 non-null  int64  
 17  year           10887 non-null  int64  
 18  DYCOL0

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون DYcol00 کامل nan است.

In [268]:
DF_P4_S1.drop(['DYCOL00'], axis=1, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
همبستگی ستون‌های income_w , wage_w, netincome_w مورد برسی قرار گرفت و چون همستگی ستون income در سال‌های دیگه کم بود فقط ستون wage_w حذف میشود.

In [269]:
DF_P4_S1[['income_w_m', 'wage_w_m', 'netincome_w_m']].corr()

Unnamed: 0,income_w_m,wage_w_m,netincome_w_m
income_w_m,1.0,0.401919,0.406996
wage_w_m,0.401919,1.0,0.986679
netincome_w_m,0.406996,0.986679,1.0


In [270]:
DF_P4_S1[['income_w_y', 'wage_w_y', 'netincome_w_y']].corr()

Unnamed: 0,income_w_y,wage_w_y,netincome_w_y
income_w_y,1.0,0.878913,0.887957
wage_w_y,0.878913,1.0,0.994099
netincome_w_y,0.887957,0.994099,1.0


In [271]:
DF_P4_S1.drop(['wage_w_m'], axis=1 , inplace= True)
DF_P4_S1.drop(['wage_w_y'], axis=1 , inplace= True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
با توجه به اطلاعات زیر ستون‌های employed_w, status_w, perk_w_y, perk_w_m دارای imblance هستند بنابراین حذف می‌کنیم.

In [272]:
for col in DF_P4_S1.columns:
 print(f'{col :>15} {(DF_P4_S1[col].value_counts()/len(DF_P4_S1)).max():f}')

        Address 0.000735
         member 0.693488
     employed_w 0.862405
         ISCO_w 0.302838
         ISIC_w 0.296776
       status_w 0.904473
        hours_w 0.512538
         days_w 0.352714
     income_w_m 0.164876
     income_w_y 0.046937
       perk_w_m 0.957840
       perk_w_y 0.789749
  netincome_w_m 0.165151
  netincome_w_y 0.046569
           Fasl 0.256361
           year 1.000000


In [273]:
DF_P4_S1.drop(['employed_w', 'status_w', 'perk_w_y', 'perk_w_m'], inplace=True, axis=1)

In [274]:
DF_P4_S1.isna().sum()

Address             0
member              0
ISCO_w              0
ISIC_w              0
hours_w          1196
days_w           1196
income_w_m          0
income_w_y          0
netincome_w_m       0
netincome_w_y       0
Fasl                0
year                0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های day_w , hours_w با گروه بندی روی ISCO_w, ISIC_w پر می‌کنیم و ما بقی را با میانه کل داده‌ها پر می‌کنیم.

In [275]:
temp = DF_P4_S1.groupby(['ISCO_w', 'ISIC_w' ])[['hours_w','days_w']].transform('median')
#temp = np.floor(temp).astype(pd.Float64Dtype())
DF_P4_S1.loc[DF_P4_S1.hours_w.isna(), 'hours_w'] = temp.loc[DF_P4_S1.hours_w.isna(), 'hours_w']
DF_P4_S1.loc[DF_P4_S1.days_w.isna(), 'days_w'] = temp.loc[DF_P4_S1.days_w.isna(), 'days_w']

In [276]:
DF_P4_S1.isna().sum()

Address           0
member            0
ISCO_w            0
ISIC_w            0
hours_w          59
days_w           59
income_w_m        0
income_w_y        0
netincome_w_m     0
netincome_w_y     0
Fasl              0
year              0
dtype: int64

In [277]:
DF_P4_S1.hours_w.fillna(DF_P4_S1.hours_w.median(),inplace= True)
DF_P4_S1.days_w.fillna(DF_P4_S1.days_w.median(), inplace= True)

In [278]:
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [279]:
path_file = 'R1399_P4_S01'
DF_P4_S1.to_csv(path_file+'.csv', index=False)

### P4_S02

In [231]:
DF_P4_S2 = pd.read_csv('R99P4S02.csv')
DF_P4_S2.head()

Unnamed: 0,Address,member,employed_s,ISCO_s,ISIC_s,status_s,agriculture,hours_s,days_s,cost_employment,cost_raw,cost_machinery,cost_others,cost_tax,sale,income_s_y,Fasl,year,DYCOL00
0,20011394626,3.0,1.0,6111.0,1110.0,5.0,1,6.0,3.0,10000000.0,62000000.0,,60000000.0,,260000000.0,128000000.0,1,1399,
1,20011394626,3.0,1.0,6121.0,1440.0,5.0,1,4.0,7.0,8000000.0,200000000.0,,,,295000000.0,87000000.0,1,1399,
2,20011394623,1.0,1.0,6121.0,1440.0,5.0,1,8.0,7.0,1800000.0,70650000.0,,,,124000000.0,51550000.0,1,1399,
3,20011394614,1.0,1.0,6111.0,1110.0,5.0,1,6.0,3.0,,54000000.0,,70000000.0,,380000000.0,256000000.0,1,1399,
4,20011394614,1.0,1.0,6121.0,1440.0,5.0,1,6.0,7.0,,50000000.0,,,,110000000.0,60000000.0,1,1399,


In [232]:
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12946 entries, 0 to 12945
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          12946 non-null  int64  
 1   member           12945 non-null  float64
 2   employed_s       12945 non-null  float64
 3   ISCO_s           12943 non-null  float64
 4   ISIC_s           12943 non-null  float64
 5   status_s         12945 non-null  float64
 6   agriculture      12946 non-null  int64  
 7   hours_s          12776 non-null  float64
 8   days_s           12782 non-null  float64
 9   cost_employment  11511 non-null  float64
 10  cost_raw         12618 non-null  float64
 11  cost_machinery   11583 non-null  float64
 12  cost_others      12199 non-null  float64
 13  cost_tax         11137 non-null  float64
 14  sale             12729 non-null  float64
 15  income_s_y       12945 non-null  float64
 16  Fasl             12946 non-null  int64  
 17  year        

In [233]:
DF_P4_S2.drop(['DYCOL00'], axis=1 , inplace= True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های cost جمع می‌شوند و در ستون جدیدی به نام Total_cost ذخیره می‌شوند.

In [234]:
del_columns = ['cost_employment', 'cost_raw', 'cost_machinery', 'cost_others', 'cost_tax']
DF_P4_S2.insert(9,'Total_cost', DF_P4_S2.loc[:,'cost_employment':'cost_tax' ].sum(axis=1))
DF_P4_S2.drop(del_columns, axis=1, inplace=True)

In [235]:
DF_P4_S2.isna().sum()

Address          0
member           1
employed_s       1
ISCO_s           3
ISIC_s           3
status_s         1
agriculture      0
hours_s        170
days_s         164
Total_cost       0
sale           217
income_s_y       1
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های day_s , hours_s با گروه بندی روی ISCO_s, ISIC_s پر می‌کنیم و ما بقی را با میانه کل داده‌ها پر می‌کنیم.

In [236]:
temp = DF_P4_S2.groupby(['ISCO_s', 'ISIC_s' ])[['hours_s','days_s']].transform('median')
temp = np.floor(temp).astype(pd.Int16Dtype())
DF_P4_S2.loc[DF_P4_S2.hours_s.isna(), 'hours_s'] = temp.loc[DF_P4_S2.hours_s.isna(), 'hours_s']
DF_P4_S2.loc[DF_P4_S2.days_s.isna(), 'days_s'] = temp.loc[DF_P4_S2.days_s.isna(), 'days_s']

DF_P4_S2.hours_s.fillna(DF_P4_S2.hours_s.median(),inplace= True)
DF_P4_S2.days_s.fillna(DF_P4_S2.days_s.median(), inplace= True)

[   9,    9,    8,    8,    6,    6,    8,    5,    5,    8,
 ...
    5, <NA>,    8,    8,    8,    9,    8,    6,    8,    6]
Length: 170, dtype: Int16' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  DF_P4_S2.loc[DF_P4_S2.hours_s.isna(), 'hours_s'] = temp.loc[DF_P4_S2.hours_s.isna(), 'hours_s']
[   5,    5,    6,    6,    6,    6,    6,    6,    6,    6,
 ...
    4, <NA>,    5,    5,    6,    6,    6,    4,    4,    4]
Length: 164, dtype: Int16' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  DF_P4_S2.loc[DF_P4_S2.days_s.isna(), 'days_s'] = temp.loc[DF_P4_S2.days_s.isna(), 'days_s']


In [237]:
DF_P4_S2.isna().sum()

Address          0
member           1
employed_s       1
ISCO_s           3
ISIC_s           3
status_s         1
agriculture      0
hours_s          0
days_s           0
Total_cost       0
sale           217
income_s_y       1
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون sale
همان طور که در کد زیر نشان داده شده است تقریبن در 100 درصد موارد nan در ستون income_s_y مقدار0 دارد.

In [238]:
DF_P4_S2.loc[DF_P4_S2.sale.isna()].income_s_y.value_counts() / DF_P4_S2.sale.isna().sum() * 100

income_s_y
0.000000e+00    98.617512
1.200000e+09     0.460829
8.400000e+08     0.460829
Name: count, dtype: float64

In [239]:
DF_P4_S2.loc[DF_P4_S2.sale.isna()].status_s.value_counts()

status_s
6.0    214
5.0      1
4.0      1
Name: count, dtype: int64

In [240]:
DF_P4_S2.fillna(0, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون employed_s به دلیل imblance حذف می‌کنیم.

In [241]:
for col in DF_P4_S2.columns:
 print(f'{col :>15} {(DF_P4_S2[col].value_counts()/len(DF_P4_S2)).max():f}')

        Address 0.000850
         member 0.701375
     employed_s 0.979453
         ISCO_s 0.356172
         ISIC_s 0.261162
       status_s 0.709254
    agriculture 0.771821
        hours_s 0.174803
         days_s 0.449019
     Total_cost 0.239456
           sale 0.219913
     income_s_y 0.209486
           Fasl 0.261471
           year 1.000000


In [242]:
DF_P4_S2.drop(['employed_s'],axis=1, inplace=True)

In [243]:
DF_P4_S2.isna().sum()

Address        0
member         0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

In [244]:
path_file = 'R1399_P4_S02'
DF_P4_S2.to_csv(path_file+'.csv', index=False)

### P4_S03

In [245]:
DF_P4_S3 = pd.read_csv('R99P4S03.csv')
DF_P4_S3.head()

Unnamed: 0,Address,member,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,Fasl,year,DYCOL00
0,20011394629,1.0,,,,10320000.0,,,1,1399,
1,20011394629,2.0,,,900000.0,,,,1,1399,
2,20011394626,1.0,,,,11410000.0,,,1,1399,
3,20011394626,2.0,,,1090000.0,,,,1,1399,
4,20011394623,1.0,,,,12950000.0,,,1,1399,


In [246]:
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23990 entries, 0 to 23989
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          23990 non-null  int64  
 1   member           23988 non-null  float64
 2   income_pension   7748 non-null   object 
 3   income_rent      6585 non-null   object 
 4   income_interest  15951 non-null  float64
 5   income_aid       18924 non-null  float64
 6   income_resale    6101 non-null   float64
 7   income_transfer  9128 non-null   float64
 8   Fasl             23990 non-null  int64  
 9   year             23990 non-null  int64  
 10  DYCOL00          0 non-null      float64
dtypes: float64(6), int64(3), object(2)
memory usage: 2.0+ MB


In [247]:
DF_P4_S3.drop(['DYCOL00'], axis = 1, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income_pension , income_rent به float تغییر می‌دهیم.

In [248]:
DF_P4_S3.income_pension = DF_P4_S3.income_pension.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_pension.isin(['']) , 'income_pension' ] = np.nan
DF_P4_S3.income_pension = DF_P4_S3.income_pension.astype(pd.Float64Dtype())

DF_P4_S3.income_rent = DF_P4_S3.income_rent.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_rent.isin(['']) , 'income_rent' ] = np.nan
DF_P4_S3.income_rent = DF_P4_S3.income_rent.astype(pd.Float64Dtype())


<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income جمع می‌شوند و در ستون جدیدی به نام Total_income ذخیره می‌شوند.

In [249]:
DF_P4_S3.insert(2,'Total_income', DF_P4_S3.loc[:,'income_pension':'income_transfer' ].sum(axis=1))
del_columns = ['income_pension','income_rent', 'income_interest','income_aid','income_resale','income_transfer']
DF_P4_S3.drop(del_columns,axis=1, inplace=True)

In [250]:
DF_P4_S3.isna().sum()

Address         0
member          2
Total_income    0
Fasl            0
year            0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
حذف سطر‌های nan

In [251]:
DF_P4_S3.dropna(inplace=True)

In [252]:
DF_P4_S3.isna().sum()

Address         0
member          0
Total_income    0
Fasl            0
year            0
dtype: int64

In [253]:
path_file = 'R1399_P4_S03'
DF_P4_S3.to_csv(path_file+'.csv', index=False)

### P4_S04

In [254]:
DF_P4_S4 = pd.read_csv('R99P4S04.csv')
DF_P4_S4.head()

Unnamed: 0,Address,member,subsidy_number,subsidy_month,subsidy,Fasl,year,DYCOL00
0,20011394629,1,4.0,12.0,21840000.0,1,1399,
1,20011394626,1,4.0,12.0,21840000.0,1,1399,
2,20011394623,1,3.0,12.0,16380000.0,1,1399,
3,20011394614,1,3.0,12.0,16380000.0,1,1399,
4,20012395829,1,2.0,12.0,10920000.0,1,1399,


In [255]:
DF_P4_S4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29088 entries, 0 to 29087
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Address         29088 non-null  int64  
 1   member          29088 non-null  int64  
 2   subsidy_number  29083 non-null  float64
 3   subsidy_month   29087 non-null  float64
 4   subsidy         29086 non-null  float64
 5   Fasl            29088 non-null  int64  
 6   year            29088 non-null  int64  
 7   DYCOL00         0 non-null      float64
dtypes: float64(4), int64(4)
memory usage: 1.8 MB


In [256]:
DF_P4_S4.drop(['DYCOL00'], axis = 1, inplace=True)

In [257]:
DF_P4_S4.isna().sum()

Address           0
member            0
subsidy_number    5
subsidy_month     1
subsidy           2
Fasl              0
year              0
dtype: int64

In [258]:
DF_P4_S4.dropna(inplace=True)

In [259]:
DF_P4_S4.isna().sum()

Address           0
member            0
subsidy_number    0
subsidy_month     0
subsidy           0
Fasl              0
year              0
dtype: int64

In [260]:
path_file = 'R1399_P4_S04'
DF_P4_S4.to_csv(path_file+'.csv', index=False)