In [96]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import statistics
#import missingno as msno

In [97]:
path_file = 'U99.xlsx'
sheets_excel = pd.read_excel('U99.xlsx',sheet_name=None)

In [98]:
for name in list(sheets_excel.keys()):
    sheets_excel[name].to_csv(name+'.csv', index=False)

### P4_S01

In [99]:
DF_P4_S1 = pd.read_csv('U99P4S01.csv')

In [100]:
DF_P4_S1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12186 entries, 0 to 12185
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        12186 non-null  int64  
 1   member         12186 non-null  int64  
 2   employed_w     12186 non-null  int64  
 3   ISCO_w         12186 non-null  int64  
 4   ISIC_w         12186 non-null  int64  
 5   status_w       12186 non-null  int64  
 6   hours_w        11381 non-null  float64
 7   days_w         11383 non-null  float64
 8   income_w_m     12186 non-null  int64  
 9   income_w_y     12186 non-null  int64  
 10  wage_w_m       12180 non-null  float64
 11  wage_w_y       12186 non-null  int64  
 12  perk_w_m       12183 non-null  float64
 13  perk_w_y       12186 non-null  int64  
 14  netincome_w_m  12186 non-null  int64  
 15  netincome_w_y  12186 non-null  int64  
 16  Fasl           12186 non-null  int64  
 17  year           12186 non-null  int64  
 18  DYCOL0

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون DYcol00 کامل nan است.

In [101]:
DF_P4_S1.drop(['DYCOL00'], axis=1, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
همبستگی ستون‌های income_w , wage_w, netincome_w مورد برسی قرار گرفت و چون همستگی ستون income در سال‌های دیگه کم بود فقط ستون wage_w حذف میشود.

In [102]:
DF_P4_S1[['income_w_m', 'wage_w_m', 'netincome_w_m']].corr()

Unnamed: 0,income_w_m,wage_w_m,netincome_w_m
income_w_m,1.0,0.792197,0.80461
wage_w_m,0.792197,1.0,0.983115
netincome_w_m,0.80461,0.983115,1.0


In [103]:
DF_P4_S1[['income_w_y', 'wage_w_y', 'netincome_w_y']].corr()

Unnamed: 0,income_w_y,wage_w_y,netincome_w_y
income_w_y,1.0,0.784004,0.797084
wage_w_y,0.784004,1.0,0.991536
netincome_w_y,0.797084,0.991536,1.0


In [104]:
DF_P4_S1.drop(['wage_w_m'], axis=1 , inplace= True)
DF_P4_S1.drop(['wage_w_y'], axis=1 , inplace= True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
با توجه به اطلاعات زیر ستون‌های employed_w, status_w, perk_w_m دارای imblance هستند بنابراین حذف می‌کنیم.

In [105]:
for col in DF_P4_S1.columns:
 print(f'{col :>15} {(DF_P4_S1[col].value_counts()/len(DF_P4_S1)).max():f}')

        Address 0.000492
         member 0.666913
     employed_w 0.916462
         ISCO_w 0.178319
         ISIC_w 0.182505
       status_w 0.769408
        hours_w 0.505170
         days_w 0.506975
     income_w_m 0.093058
     income_w_y 0.039800
       perk_w_m 0.930822
       perk_w_y 0.579025
  netincome_w_m 0.093304
  netincome_w_y 0.039307
           Fasl 0.253816
           year 1.000000


In [106]:
DF_P4_S1.drop(['employed_w', 'status_w',  'perk_w_m'], inplace=True, axis=1)

In [107]:
DF_P4_S1.isna().sum()

Address            0
member             0
ISCO_w             0
ISIC_w             0
hours_w          805
days_w           803
income_w_m         0
income_w_y         0
perk_w_y           0
netincome_w_m      0
netincome_w_y      0
Fasl               0
year               0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های day_w , hours_w با گروه بندی روی ISCO_w, ISIC_w پر می‌کنیم و ما بقی را با میانه کل داده‌ها پر می‌کنیم.

In [108]:
temp = DF_P4_S1.groupby(['ISCO_w', 'ISIC_w' ])[['hours_w','days_w']].transform('median')
#temp = np.floor(temp).astype(pd.Float64Dtype())
DF_P4_S1.loc[DF_P4_S1.hours_w.isna(), 'hours_w'] = temp.loc[DF_P4_S1.hours_w.isna(), 'hours_w']
DF_P4_S1.loc[DF_P4_S1.days_w.isna(), 'days_w'] = temp.loc[DF_P4_S1.days_w.isna(), 'days_w']

In [109]:
DF_P4_S1.isna().sum()

Address            0
member             0
ISCO_w             0
ISIC_w             0
hours_w          102
days_w           103
income_w_m         0
income_w_y         0
perk_w_y           0
netincome_w_m      0
netincome_w_y      0
Fasl               0
year               0
dtype: int64

In [110]:
DF_P4_S1.hours_w.fillna(DF_P4_S1.hours_w.median(),inplace= True)
DF_P4_S1.days_w.fillna(DF_P4_S1.days_w.median(), inplace= True)

In [111]:
DF_P4_S1.isna().sum()

Address          0
member           0
ISCO_w           0
ISIC_w           0
hours_w          0
days_w           0
income_w_m       0
income_w_y       0
perk_w_y         0
netincome_w_m    0
netincome_w_y    0
Fasl             0
year             0
dtype: int64

In [112]:
path_file = 'U1399_P4_S01'
DF_P4_S1.to_csv(path_file+'.csv', index=False)

### P4_S02

In [32]:
DF_P4_S2 = pd.read_csv('U99P4S02.csv')
DF_P4_S2.head()

Unnamed: 0,Address,member,employed_s,ISCO_s,ISIC_s,status_s,agriculture,hours_s,days_s,cost_employment,cost_raw,cost_machinery,cost_others,cost_tax,sale,income_s_y,Fasl,year,DYCOL00
0,10009008518,1,1,6121.0,1440.0,5,1,6.0,7.0,24000000.0,200000000.0,,9000000.0,,300000000.0,67000000,1,1399,
1,10011009714,1,1,7412.0,95220.0,5,2,10.0,6.0,,100000000.0,20000000.0,35000000.0,,335000000.0,180000000,1,1399,
2,10005004618,1,1,7233.0,33120.0,5,2,7.0,7.0,,120000000.0,30000000.0,90000000.0,5000000.0,445000000.0,200000000,1,1399,
3,10006005924,1,1,7517.0,10711.0,4,2,10.0,7.0,18000000.0,160000000.0,2000000.0,10000000.0,,430000000.0,240000000,1,1399,
4,10009008517,1,1,8322.0,49230.0,5,2,7.0,6.0,,35000000.0,22000000.0,20000000.0,3000000.0,190000000.0,110000000,1,1399,


In [33]:
DF_P4_S2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6377 entries, 0 to 6376
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          6377 non-null   int64  
 1   member           6377 non-null   int64  
 2   employed_s       6377 non-null   int64  
 3   ISCO_s           6376 non-null   float64
 4   ISIC_s           6375 non-null   float64
 5   status_s         6377 non-null   int64  
 6   agriculture      6377 non-null   int64  
 7   hours_s          6252 non-null   float64
 8   days_s           6253 non-null   float64
 9   cost_employment  5477 non-null   float64
 10  cost_raw         6125 non-null   float64
 11  cost_machinery   5890 non-null   float64
 12  cost_others      6052 non-null   float64
 13  cost_tax         5454 non-null   float64
 14  sale             6327 non-null   float64
 15  income_s_y       6377 non-null   int64  
 16  Fasl             6377 non-null   int64  
 17  year          

In [34]:
DF_P4_S2.drop(['DYCOL00'], axis=1 , inplace= True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های cost جمع می‌شوند و در ستون جدیدی به نام Total_cost ذخیره می‌شوند.

In [35]:
del_columns = ['cost_employment', 'cost_raw', 'cost_machinery', 'cost_others', 'cost_tax']
DF_P4_S2.insert(9,'Total_cost', DF_P4_S2.loc[:,'cost_employment':'cost_tax' ].sum(axis=1))
DF_P4_S2.drop(del_columns, axis=1, inplace=True)

In [36]:
DF_P4_S2.isna().sum()

Address          0
member           0
employed_s       0
ISCO_s           1
ISIC_s           2
status_s         0
agriculture      0
hours_s        125
days_s         124
Total_cost       0
sale            50
income_s_y       0
Fasl             0
year             0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های day_s , hours_s با گروه بندی روی ISCO_s, ISIC_s پر می‌کنیم و ما بقی را با میانه کل داده‌ها پر می‌کنیم.

In [37]:
temp = DF_P4_S2.groupby(['ISCO_s', 'ISIC_s' ])[['hours_s','days_s']].transform('median')

DF_P4_S2.loc[DF_P4_S2.hours_s.isna(), 'hours_s'] = temp.loc[DF_P4_S2.hours_s.isna(), 'hours_s']
DF_P4_S2.loc[DF_P4_S2.days_s.isna(), 'days_s'] = temp.loc[DF_P4_S2.days_s.isna(), 'days_s']

DF_P4_S2.hours_s.fillna(DF_P4_S2.hours_s.median(),inplace= True)
DF_P4_S2.days_s.fillna(DF_P4_S2.days_s.median(), inplace= True)

In [38]:
DF_P4_S2.isna().sum()

Address         0
member          0
employed_s      0
ISCO_s          1
ISIC_s          2
status_s        0
agriculture     0
hours_s         0
days_s          0
Total_cost      0
sale           50
income_s_y      0
Fasl            0
year            0
dtype: int64

In [39]:
DF_P4_S2.dropna(subset=['ISCO_s', 'ISIC_s' ],inplace=True)

In [40]:
DF_P4_S2.isna().sum()

Address         0
member          0
employed_s      0
ISCO_s          0
ISIC_s          0
status_s        0
agriculture     0
hours_s         0
days_s          0
Total_cost      0
sale           50
income_s_y      0
Fasl            0
year            0
dtype: int64

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون sale
همان طور که در کد زیر نشان داده شده است تقریبن در 100 درصد موارد nan در ستون income_s_y مقدار0 دارد.

In [41]:
DF_P4_S2.loc[DF_P4_S2.sale.isna()].income_s_y.value_counts() / DF_P4_S2.sale.isna().sum() * 100

income_s_y
0            98.0
170000000     2.0
Name: count, dtype: float64

In [42]:
DF_P4_S2.loc[DF_P4_S2.sale.isna()].status_s.value_counts()

status_s
6    49
5     1
Name: count, dtype: int64

In [44]:
DF_P4_S2.fillna(0, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون employed_s به دلیل imblance حذف می‌کنیم.

In [45]:
for col in DF_P4_S2.columns:
 print(f'{col :>15} {(DF_P4_S2[col].value_counts()/len(DF_P4_S2)).max():f}')

        Address 0.001098
         member 0.788706
     employed_s 0.968471
         ISCO_s 0.192784
         ISIC_s 0.117333
       status_s 0.838275
    agriculture 0.780549
        hours_s 0.280314
         days_s 0.459922
     Total_cost 0.180706
           sale 0.066824
     income_s_y 0.061020
           Fasl 0.253647
           year 1.000000


In [46]:
DF_P4_S2.drop(['employed_s'],axis=1, inplace=True)

In [47]:
DF_P4_S2.isna().sum()

Address        0
member         0
ISCO_s         0
ISIC_s         0
status_s       0
agriculture    0
hours_s        0
days_s         0
Total_cost     0
sale           0
income_s_y     0
Fasl           0
year           0
dtype: int64

In [48]:
path_file = 'U1399_P4_S02'
DF_P4_S2.to_csv(path_file+'.csv', index=False)

### P4_S03

In [74]:
DF_P4_S3 = pd.read_csv('U99P4S03.csv')
DF_P4_S3.head()

Unnamed: 0,Address,member,income_pension,income_rent,income_interest,income_aid,income_resale,income_transfer,Fasl,year,DYCOL00
0,10009008518,1,,120000000.0,,10000000.0,,40000000.0,1,1399,
1,10005004626,1,216000000.0,,,16900000.0,,,1,1399,
2,10006005929,1,,,1600000.0,16900000.0,,,1,1399,
3,10011009720,1,,,6300000.0,10000000.0,,,1,1399,
4,10005004618,1,,,,10000000.0,,,1,1399,


In [75]:
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23632 entries, 0 to 23631
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          23632 non-null  int64  
 1   member           23632 non-null  int64  
 2   income_pension   10601 non-null  object 
 3   income_rent      7516 non-null   object 
 4   income_interest  14115 non-null  object 
 5   income_aid       18249 non-null  float64
 6   income_resale    6860 non-null   object 
 7   income_transfer  9475 non-null   object 
 8   Fasl             23632 non-null  int64  
 9   year             23632 non-null  int64  
 10  DYCOL00          0 non-null      float64
dtypes: float64(2), int64(4), object(5)
memory usage: 2.0+ MB


In [76]:
DF_P4_S3.drop(['DYCOL00'], axis = 1, inplace=True)

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income_pension , income_rent به float تغییر می‌دهیم.

In [77]:
DF_P4_S3.income_pension = DF_P4_S3.income_pension.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_pension.isin(['']) , 'income_pension' ] = np.nan
DF_P4_S3.income_pension = DF_P4_S3.income_pension.astype(pd.Float64Dtype())

DF_P4_S3.income_rent = DF_P4_S3.income_rent.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_rent.isin(['']) , 'income_rent' ] = np.nan
DF_P4_S3.income_rent = DF_P4_S3.income_rent.astype(pd.Float64Dtype())

DF_P4_S3.income_interest = DF_P4_S3.income_interest.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_interest.isin(['']) , 'income_interest' ] = np.nan
DF_P4_S3.income_interest = DF_P4_S3.income_interest.astype(pd.Float64Dtype())

DF_P4_S3.income_resale = DF_P4_S3.income_resale.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_resale.isin(['']) , 'income_resale' ] = np.nan
DF_P4_S3.income_resale = DF_P4_S3.income_resale.astype(pd.Float64Dtype())

DF_P4_S3.income_interest = DF_P4_S3.income_interest.str.strip()
DF_P4_S3.loc[DF_P4_S3.income_interest.isin(['']) , 'income_interest' ] = np.nan
DF_P4_S3.income_interest = DF_P4_S3.income_interest.astype(pd.Float64Dtype())


In [79]:
DF_P4_S3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23632 entries, 0 to 23631
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Address          23632 non-null  int64  
 1   member           23632 non-null  int64  
 2   income_pension   10600 non-null  Float64
 3   income_rent      7515 non-null   Float64
 4   income_interest  14114 non-null  Float64
 5   income_aid       18249 non-null  float64
 6   income_resale    6860 non-null   object 
 7   income_transfer  9475 non-null   object 
 8   Fasl             23632 non-null  int64  
 9   year             23632 non-null  int64  
dtypes: Float64(3), float64(1), int64(4), object(2)
memory usage: 1.9+ MB


<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
ستون‌های income جمع می‌شوند و در ستون جدیدی به نام Total_income ذخیره می‌شوند.

In [78]:
DF_P4_S3.insert(2,'Total_income', DF_P4_S3.loc[:,'income_pension':'income_transfer' ].sum(axis=1))
del_columns = ['income_pension','income_rent', 'income_interest','income_aid','income_resale','income_transfer']
DF_P4_S3.drop(del_columns,axis=1, inplace=True)

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [None]:
DF_P4_S3.isna().sum()

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=5 color= orange>
حذف سطر‌های nan

In [None]:
DF_P4_S3.dropna(inplace=True)

In [None]:
DF_P4_S3.isna().sum()

In [None]:
path_file = 'R1399_P4_S03'
DF_P4_S3.to_csv(path_file+'.csv', index=False)

### P4_S04

In [None]:
DF_P4_S4 = pd.read_csv('R99P4S04.csv')
DF_P4_S4.head()

In [None]:
DF_P4_S4.info()

In [None]:
DF_P4_S4.drop(['DYCOL00'], axis = 1, inplace=True)

In [None]:
DF_P4_S4.isna().sum()

In [None]:
DF_P4_S4.dropna(inplace=True)

In [None]:
DF_P4_S4.isna().sum()

In [None]:
path_file = 'R1399_P4_S04'
DF_P4_S4.to_csv(path_file+'.csv', index=False)