# Data inputs and Display Libraries



In [1]:

import pandas as pd
import numpy as np

# Render floats with five decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'


# EDA Libraries

# Data Preprocessing Libraries

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder


# Feature Selection & Modelling Libraries

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
import pickle


In [8]:
!git clone https://github.com/ragamudra/workshop_classification

Cloning into 'workshop_classification'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 8 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


In [12]:
!unrar x 'workshop_classification/prep_file.rar' 'workshop_classification'


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from workshop_classification/prep_file.rar

Extracting  workshop_classification/prep_file.csv                          3%  6% 10% 13% 17% 20% 24% 27% 30% 34% 37% 41% 44% 48% 51% 54% 58% 61% 65% 68% 72% 75% 78% 82% 85% 89% 92% 96% 99%  OK 
All OK


In [13]:
# Load the extracted modelling dataset into the working frame `t1`.
prep_csv_path = "./workshop_classification/prep_file.csv"
t1 = pd.read_csv(prep_csv_path)

# Step 1 - Look at the data

In [14]:
# Preview the first rows to sanity-check the load.
t1.head()

Unnamed: 0.1,Unnamed: 0,income,age,experience,bureau_score,married,house_ownership,car_ownership,risk_flag,profession,city,state,current_job_years,current_house_years,device
0,19607,2514921,31.0,4.0,651.0,single,rented,no,0,Psychologist,Chandrapur,Maharashtra,4.0,14.0,Oppo
1,75516,7047674,28.0,4.0,526.0,single,rented,yes,0,Economist,Ramagundam[27],Telangana,3.0,13.0,Xiaomi
2,63804,2749317,30.0,2.0,526.0,single,rented,no,0,Secretary,Ramagundam[27],Telangana,2.0,14.0,samsung
3,63676,7378274,24.0,0.0,764.0,single,rented,no,0,Flight attendant,Adoni,Andhra Pradesh,0.0,11.0,samsung
4,50914,9574585,27.0,5.0,739.0,single,rented,yes,0,Technician,Imphal,Manipur,5.0,10.0,Vivo


In [15]:
# List the column labels present in the loaded frame.
t1.columns

Index(['Unnamed: 0', 'income', 'age', 'experience', 'bureau_score', 'married',
       'house_ownership', 'car_ownership', 'risk_flag', 'profession', 'city',
       'state', 'current_job_years', 'current_house_years', 'device'],
      dtype='object')

# Look at data - now to get the target variable distribution

In [None]:
# Absolute number of rows per value of the target column.
t1["risk_flag"].value_counts()

0    236567
1     43433
Name: risk_flag, dtype: int64

In [16]:
# Same distribution expressed as a fraction of all rows.
t1["risk_flag"].value_counts(normalize=True)

0   0.84488
1   0.15512
Name: risk_flag, dtype: float64

# Look at data - listing string and numeric columns

In [17]:
# Names of the string (categorical) feature columns, stored one per row in a helper CSV.
str_col_name_df = pd.read_csv("./workshop_classification/str_cols.csv")
str_col_name_df.columns = ["index", "col_name"]
str_col_name_list = str_col_name_df["col_name"].tolist()

# Names of the numeric feature columns, read from the sibling helper CSV.
num_col_name_df = pd.read_csv("./workshop_classification/num_cols.csv")
num_col_name_df.columns = ["index", "col_name"]
num_col_name_df = num_col_name_df.reset_index()
num_col_name_list = num_col_name_df["col_name"].tolist()

In [18]:
# Show which columns are treated as categorical and which as numeric.
for col_names in (str_col_name_list, num_col_name_list):
    print(col_names)

['profession', 'married', 'house_ownership', 'car_ownership', 'city', 'state']
['income', 'age', 'experience', 'current_job_years', 'current_house_years', 'bureau_score']


In [19]:
# Keep an untouched snapshot of the raw frame before normalising text in place.
t0 = t1.copy()
for col in str_col_name_list:
    # Lower-case, trim surrounding whitespace, then drop every character that is not
    # a-z or whitespace (e.g. "Ramagundam[27]" -> "ramagundam").
    # regex=True is explicit because pandas 2.0 changed the default to literal matching,
    # which would silently leave the values uncleaned; the raw string avoids the
    # invalid "\s" escape warning.
    t1[col] = (
        t1[col]
        .str.lower()
        .str.strip()
        .str.replace(r"[^a-z\s]+", "", regex=True)
    )


  after removing the cwd from sys.path.


# Train test split
### Before we do any preprocessing, we want to keep train and test separate

In [20]:
# Hold out 33% of the rows for testing; the fixed seed keeps the partition reproducible.
# The feature frame passed here is the full `t1` (it still carries risk_flag);
# later cells pick the model inputs by column name.
x_train, x_test, y_train, y_test = train_test_split(
    t1,
    t1["risk_flag"],
    test_size=0.33,
    random_state=42,
)

In [21]:
# reset_index() without drop keeps the shuffled row labels as an "index" column and
# gives each partition a fresh 0..n-1 index.
x_train0, x_test0 = (part.reset_index() for part in (x_train, x_test))

In [22]:
# (rows, columns) of the train and test partitions.
x_train0.shape, x_test0.shape

((187600, 16), (92400, 16))

# One-hot encoding string variables -- baseline categorical approach

In [23]:


# One-hot encode the categorical columns. The encoder is fitted on the training
# partition only; categories unseen at fit time become all-zero rows in the test set.
enc = OneHotEncoder(handle_unknown='ignore')

# Plain arrays (not DataFrames) are passed on purpose so that the generated feature
# names keep the positional "x0_", "x1_", ... prefixes.
# .toarray() yields a regular ndarray instead of the legacy np.matrix from .todense().
df_one_hot_tr = pd.DataFrame(enc.fit_transform(np.array(x_train0[str_col_name_list])).toarray())
df_one_hot_te = pd.DataFrame(enc.transform(np.array(x_test0[str_col_name_list])).toarray())

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back only on releases that predate get_feature_names_out().
if hasattr(enc, "get_feature_names_out"):
    colnames = enc.get_feature_names_out()
else:
    colnames = enc.get_feature_names()

df_one_hot_tr.columns = colnames
df_one_hot_te.columns = colnames



In [24]:
# Small integer-typed preview of the encoded training matrix for display.
viz1 = df_one_hot_tr.head().astype(int)
viz1.columns = colnames
viz1

Unnamed: 0,x0_air traffic controller,x0_analyst,x0_architect,x0_army officer,x0_artist,x0_aviator,x0_biomedical engineer,x0_chartered accountant,x0_chef,x0_chemical engineer,x0_civil engineer,x0_civil servant,x0_comedian,x0_computer hardware engineer,x0_computer operator,x0_consultant,x0_dentist,x0_design engineer,x0_designer,x0_drafter,x0_economist,x0_engineer,x0_fashion designer,x0_financial analyst,x0_firefighter,x0_flight attendant,x0_geologist,x0_graphic designer,x0_hotel manager,x0_industrial engineer,x0_lawyer,x0_librarian,x0_magistrate,x0_mechanical engineer,x0_microbiologist,x0_official,x0_petroleum engineer,x0_physician,x0_police officer,x0_politician,...,x4_ulhasnagar,x4_uluberia,x4_unnao,x4_vadodara,x4_varanasi,x4_vasaivirar,x4_vellore,x4_vijayanagaram,x4_vijayawada,x4_visakhapatnam,x4_warangal,x4_yamunanagar,x5_andhra pradesh,x5_assam,x5_bihar,x5_chandigarh,x5_chhattisgarh,x5_delhi,x5_gujarat,x5_haryana,x5_himachal pradesh,x5_jammu and kashmir,x5_jharkhand,x5_karnataka,x5_kerala,x5_madhya pradesh,x5_maharashtra,x5_manipur,x5_mizoram,x5_odisha,x5_puducherry,x5_punjab,x5_rajasthan,x5_sikkim,x5_tamil nadu,x5_telangana,x5_tripura,x5_uttar pradesh,x5_uttarakhand,x5_west bengal
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Fill missing numeric values with column means learned from the training partition only.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(x_train0[num_col_name_list])

# transform() returns bare arrays, so the column labels are re-attached here.
x_train_num = pd.DataFrame(
    imp_mean.transform(x_train0[num_col_name_list]),
    columns=num_col_name_list,
)
x_test_num = pd.DataFrame(
    imp_mean.transform(x_test0[num_col_name_list]),
    columns=num_col_name_list,
)

SimpleImputer()

# Concatenating Numeric and categorical

In [26]:
# Place the one-hot block and the imputed numeric block side by side, per partition.
df_all_train1 = pd.concat([df_one_hot_tr, x_train_num], axis="columns")
df_all_test1 = pd.concat([df_one_hot_te, x_test_num], axis="columns")

In [27]:
# Column labels in the same order as the concatenated feature matrix.
all_cols = [*colnames, *num_col_name_list]

In [None]:
##back to presentation

In [28]:

# Learn the target label mapping from the training labels, then apply it to both partitions.
le = preprocessing.LabelEncoder().fit(y_train)
y_train1, y_test1 = le.transform(y_train), le.transform(y_test)

:

In [29]:
#Feature selection

# Keep the top 60% of features ranked by the ANOVA F-score (f_classif).
# The selector is fitted on training data only and then applied to both partitions.
selector = SelectPercentile(score_func=f_classif, percentile=60)
selector.fit(df_all_train1, y_train1)
df_all_train2, df_all_test2 = (
    selector.transform(features) for features in (df_all_train1, df_all_test1)
)

SelectPercentile(percentile=60)

In [30]:
# Training-matrix shape before and after feature selection.
df_all_train1.shape, df_all_train2.shape

((187600, 408), (187600, 245))

In [31]:
# Boolean mask, one entry per input feature, marking what the selector kept.
columns_flag = selector.get_support()
# Pair each column label with its mask entry and keep the retained ones.
sel_cols = [name for name, keep in zip(all_cols, columns_flag) if keep]

In [32]:
# Names of the features retained by the selector.
sel_cols

['x0_air traffic controller',
 'x0_analyst',
 'x0_army officer',
 'x0_artist',
 'x0_chef',
 'x0_civil engineer',
 'x0_comedian',
 'x0_computer operator',
 'x0_dentist',
 'x0_design engineer',
 'x0_drafter',
 'x0_economist',
 'x0_firefighter',
 'x0_flight attendant',
 'x0_graphic designer',
 'x0_industrial engineer',
 'x0_lawyer',
 'x0_librarian',
 'x0_magistrate',
 'x0_mechanical engineer',
 'x0_petroleum engineer',
 'x0_physician',
 'x0_police officer',
 'x0_politician',
 'x0_psychologist',
 'x0_scientist',
 'x0_secretary',
 'x0_surveyor',
 'x0_technical writer',
 'x0_technician',
 'x0_web designer',
 'x1_married',
 'x1_single',
 'x2_norentnoown',
 'x2_owned',
 'x2_rented',
 'x3_no',
 'x3_yes',
 'x4_agartala',
 'x4_agra',
 'x4_aizawl',
 'x4_akola',
 'x4_alappuzha',
 'x4_allahabad',
 'x4_alwar',
 'x4_ambala',
 'x4_amravati',
 'x4_anantapuram',
 'x4_arrah',
 'x4_asansol',
 'x4_aurangabad',
 'x4_avadi',
 'x4_ballia',
 'x4_barasat',
 'x4_bathinda',
 'x4_begusarai',
 'x4_belgaum',
 'x4_bel

In [None]:
### For reference only -- the cells below are meant to be read, not run

In [None]:
# Directory that receives the pickled artefacts.
fl_out = "./datasets"
def pik_now(ob_name):
    """Pickle the module-level object named ``ob_name`` into ``fl_out``.

    The object is looked up by name in this module's globals and written to
    ``fl_out + "/" + ob_name`` (the file name is the variable name, no extension).

    Parameters
    ----------
    ob_name : str
        Name of a global variable to serialise.

    Raises
    ------
    NameError
        If no global variable called ``ob_name`` exists.
    """
    import os  # local import keeps this cell self-contained

    # Plain namespace lookup instead of eval(): same result for variable names,
    # without executing arbitrary expressions passed in as strings.
    try:
        target = globals()[ob_name]
    except KeyError:
        raise NameError("name %r is not defined" % ob_name) from None

    # Create the output directory on first use so the dump does not fail on a fresh checkout.
    os.makedirs(fl_out, exist_ok=True)
    fl_out1 = fl_out + "/" + ob_name

    # The context manager closes the file even if pickling raises.
    with open(fl_out1, "wb") as pickling_on:
        pickle.dump(target, pickling_on)

    return

In [None]:
# Variable names of the final train/test matrices and encoded targets to persist.
list_objs = [
    "df_all_train2",
    "y_train1",
    "df_all_test2",
    "y_test1",
]

# Each entry is a variable *name*; pik_now resolves it and writes one file per object.
for obj_name in list_objs:
    pik_now(obj_name)