In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Collect the full path of every file found anywhere under the Kaggle input directory.
files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

print(files)

In [None]:
# Load the third path collected in `files` into a DataFrame and preview the first rows.
# NOTE(review): index 2 depends on the order in which os.walk listed the files —
# confirm it points at the intended CSV.
df = pd.read_csv(files[2])
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column overview: number of NaNs, number of distinct values, and the distinct
# values themselves when there are at most 10 of them.
header = f'{("Column Name").rjust(25)}   {"#NaNs": <10} {"#Unique Values"}'
print(header)

for column in df.columns:
    distinct = df[column].unique()
    n_distinct = len(distinct)
    n_missing = df[column].isnull().sum()
    # Listing more than 10 values would clutter the table, so show nothing instead.
    shown = "" if n_distinct > 10 else distinct
    print(f'{(column).rjust(25)}   {str(n_missing): <10} {n_distinct}\t\t{str(shown): <10}')

In [None]:
import matplotlib.pyplot as plt

def how_many_in_cat_plot(df, variable, color, a, i):
    """Draw a histogram of one column in a cell of a 5x3 subplot grid.

    Parameters
    ----------
    df : frame indexed by column name; ``df[variable]`` is the data plotted.
    variable : column name; also used as the subplot title.
    color : face colour of the histogram bars.
    a : alpha (transparency) of the bars.
    i : zero-based position; the histogram is drawn in grid slot ``i + 1``.

    Draws on the current matplotlib figure and returns nothing.
    """
    panel = plt.subplot(5, 3, i + 1)
    # Tilt the tick labels slightly.
    panel.tick_params(labelrotation=15)
    panel.hist(df[variable], facecolor=color, alpha=a)
    panel.set_title(variable)
    

In [None]:
# List the column names of the loaded frame.
df.columns

In [None]:
# One histogram per feature, laid out on a single large figure.
colors = ['red', 'blue', 'orange', 'green', 'magenta', 'pink', 'black', 'grey']
features_to_plot = [
    'city', 'city_development_index', 'gender',
    'relevent_experience', 'enrolled_university', 'education_level',
    'major_discipline', 'experience', 'company_size', 'company_type',
    'last_new_job', 'training_hours', 'target',
]

fig = plt.figure(figsize=(30, 30))
# Work on a copy so that the 'Missing' placeholder never reaches df itself.
temp_df = df.copy()

for i, feature in enumerate(features_to_plot):
    # NaNs get their own 'Missing' bar instead of being left out of the histogram.
    temp_df[feature] = df[feature].fillna('Missing')
    # Cycle through the palette when there are more features than colours.
    bar_color = colors[i % len(colors)]
    how_many_in_cat_plot(temp_df, feature, bar_color, 0.5, i)

plt.show()

In [None]:
# For every feature, print how the target values split per category and the
# mean target per category (highest first).
summary_features = [
    'city', 'city_development_index', 'gender',
    'relevent_experience', 'enrolled_university', 'education_level',
    'major_discipline', 'experience', 'company_size', 'company_type',
    'last_new_job', 'training_hours',
]

for feature in summary_features:
    target_by_category = df.groupby(feature)['target']
    print(target_by_category.value_counts())
    print(target_by_category.mean().sort_values(ascending=False))

In [None]:
# Preview the first rows of the frame again.
df.head()

In [None]:
# Counts of each target value among the rows whose city is 'city_171'.
print(df.groupby('city')['target'].value_counts()['city_171'])

### Creating a Final DataFrame

In [None]:
# Start from an empty frame; the cells below add the processed features to it one column at a time.
df_final = pd.DataFrame()

#### To add:
- ~~'city'~~ >> dropped
- ~~'city_development_index'~~
- ~~'gender'~~
- ~~'relevent_experience'~~
- ~~'enrolled_university'~~
- ~~'education_level'~~
- ~~'major_discipline'~~
- ~~'experience'~~
- 'company_size'
- 'company_type'
- ~~'last_new_job'~~
- ~~'training_hours'~~ >> scaled, transformed

### City Development Index

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat, pylab

# Three-panel look at the raw feature: density plot, histogram, normal Q-Q plot.
feature = 'city_development_index'
raw_values = df[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(raw_values)
plt.title('Before Scaling')

plt.subplot(1, 3, 2)
raw_values.hist()

plt.subplot(1, 3, 3)
stat.probplot(raw_values, dist='norm', plot=pylab)

plt.show()

In [None]:
# Scratch frame that collects the transformed variants of city_development_index tried below.
temp_df = pd.DataFrame()

In [None]:
# Box-Cox transformation of city_development_index
feature = 'cdi_boxcox'
# boxcox returns the transformed values together with the fitted lambda parameter.
boxcox_values, parameter = stat.boxcox(df['city_development_index'])
temp_df[feature] = boxcox_values
transformed = temp_df[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

In [None]:
# Report the parameter returned by the most recent stat.boxcox call, then preview the scratch frame.
print(f'Box Cox Parameter: {parameter}')
temp_df.head()

In [None]:
# Reciprocal power transformation: 1 / x**0.01
feature = 'cdi_ExpOfReci'
cdi = df['city_development_index']
temp_df[feature] = 1 / (cdi ** 0.01)
transformed = temp_df[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

In [None]:
# Preview the transformed variants collected so far.
temp_df.head()

In [None]:
# The untransformed city_development_index is what goes into the final frame.
df_final['city_development_index'] = df['city_development_index']
df_final.head()

### Relevent Experience

In [None]:
# Binary encoding: 1 where the value is exactly 'Has relevent experience', 0 for everything else
# (any other label, and NaN, therefore map to 0).
df_final['relevent_experience'] = np.where(df['relevent_experience']=='Has relevent experience', 1, 0)
df_final.head()

### Experience

In [None]:
# Turn `experience` into a numeric column.
# The open-ended labels get a representative number: '>20' -> 21 and '<1' -> 0.5.
# All remaining labels are then parsed as numbers; without that step the column
# stays object dtype with strings and numbers mixed together.
# The replacements are written as strings so that pd.to_numeric performs the
# single conversion, and so that re-running the cell on the already-numeric
# column is a no-op.
experience_labels = {'>20': '21', '<1': '0.5'}
df['experience'] = pd.to_numeric(df['experience'].replace(experience_labels))

# Missing values are not imputed here; they remain NaN in the final frame.
df_final['experience'] = df['experience']
df_final.head()

### Last New Job

In [None]:
# Replace the two non-numeric labels with numbers: '>4' becomes 5 and 'never' becomes 0.
# This writes into df itself; all other values are left as they are.
df.loc[df['last_new_job']== '>4' , 'last_new_job'] = 5
df.loc[df['last_new_job']== 'never' , 'last_new_job'] = 0
df.head(10)

In [None]:
# Fill missing values with the most frequent value, then cast the column to int.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['last_new_job'] is chained assignment, which does not reliably update df
# (under pandas Copy-on-Write it has no effect, and astype(int) would then fail on NaN).
last_new_job_mode = df['last_new_job'].mode()[0]
df['last_new_job'] = df['last_new_job'].fillna(last_new_job_mode).astype(int)

In [None]:
# Copy the last_new_job column of df into the final frame.
df_final['last_new_job'] = df['last_new_job']
df_final.head()

### Training Hours

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stat, pylab

# Three-panel look at the raw feature: density plot, histogram, normal Q-Q plot.
feature = 'training_hours'
raw_values = df[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(raw_values)
plt.title('Before Scaling')

plt.subplot(1, 3, 2)
raw_values.hist()

plt.subplot(1, 3, 3)
stat.probplot(raw_values, dist='norm', plot=pylab)

plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standard-scale training_hours and keep the result in its own frame, df_scaled,
# under the column name 'training_hours_scaled'.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[['training_hours']]),
    columns=['training_hours_scaled'],
)
df_scaled

In [None]:
# Same three diagnostic panels, now for the standard-scaled values.
feature = 'training_hours_scaled'
scaled_values = df_scaled[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(scaled_values)
plt.title('After Scaling')

plt.subplot(1, 3, 2)
scaled_values.hist()

plt.subplot(1, 3, 3)
stat.probplot(scaled_values, dist='norm', plot=pylab)

plt.show()

In [None]:
# Logarithmic transformation (natural log) of training_hours
feature = 'training_hours_log'
df_scaled[feature] = np.log(df['training_hours'])
transformed = df_scaled[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

In [None]:
# Power transformation of training_hours: x ** (1/5), i.e. the fifth root
feature = 'training_hours_exp'
hours = df['training_hours']
df_scaled[feature] = hours ** (1/5)
transformed = df_scaled[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

In [None]:
# Box-Cox transformation of training_hours
feature = 'training_hours_boxcox'
# boxcox returns the transformed values together with the fitted lambda parameter.
boxcox_values, parameter = stat.boxcox(df['training_hours'])
df_scaled[feature] = boxcox_values
transformed = df_scaled[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

In [None]:
# Report the parameter returned by the most recent stat.boxcox call, then preview
# all training_hours variants side by side.
print(f'Box Cox Parameter: {parameter}')
df_scaled.head()

#### From the distribution curves and Q-Q plots, it is evident that the Box-Cox transformation gives the distribution closest to normal, followed by the power ("Exponential") transformation with exponent = 0.2.

#### Let's see Standard Scaling on the Box-Cox transformed data

In [None]:
# Standard scaling applied to the Box-Cox transformed training_hours
feature = 'train_hrs_scalingOnboxcox'

scaler = StandardScaler()
# fit_transform returns a 2-D array with a single column; store that column.
temp_df_scaled = scaler.fit_transform(df_scaled[['training_hours_boxcox']])
df_scaled[feature] = temp_df_scaled[:, 0]
transformed = df_scaled[feature]

fig = plt.figure(figsize=(18, 4))

plt.subplot(1, 3, 1)
ax = sns.distplot(transformed, fit=stat.norm)
plt.title('After Transformation')

plt.subplot(1, 3, 2)
transformed.hist()

plt.subplot(1, 3, 3)
stat.probplot(transformed, dist='norm', plot=pylab)

plt.show()

#### Thus, we notice that performing Standard Scaling on the Box-Cox transformed data leaves the shape of the distribution unchanged: the values are only shifted and rescaled linearly so that the mean is 0 and the standard deviation is 1.

In [None]:
# Preview every scaled / transformed variant of training_hours.
df_scaled.head()

#### Depending on the model, we might need the scaled and transformed data or we might not.

In [None]:
# The raw training_hours goes into the final frame; the trailing comment names the
# scaled Box-Cox column that could be used here instead.
df_final['training_hours'] = df['training_hours'] # df_scaled['train_hrs_scalingOnboxcox']
df_final.head()

### Enrolled University

In [None]:
# Frequency of each enrolled_university category (NaN is not counted by default).
df['enrolled_university'].value_counts()

In [None]:
# Impute missing enrolled_university values with the most frequent category.
df_final['enrolled_university'] = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df_final.head()

In [None]:
# Ordinal encoding of enrolled_university: no enrollment < part time < full time.
d = {'no_enrollment':0, 'Part time course':1,'Full time course':2}

# Encode from the source column (missing values filled with the most frequent
# category) rather than from df_final's own column: mapping the already-encoded
# integers a second time would turn every value into NaN, so the cell would not
# survive being re-run.
enrolled_filled = df['enrolled_university'].fillna(df['enrolled_university'].mode()[0])
df_final['enrolled_university'] = enrolled_filled.map(d)
df_final.head()

### Education Level

In [None]:
# Number of missing education_level values, followed by the frequency of each category.
print(df['education_level'].isnull().sum())
df['education_level'].value_counts()

In [None]:
# Ordinal encoding of education_level, from Primary School (1) up to Phd (5).
d = {'Primary School': 1, 'High School': 2, 'Graduate': 3,'Masters': 4, 'Phd': 5}

# Missing values are filled with the most frequent level before encoding.
# The steps are chained and assigned once: fillna(inplace=True) on
# df_final['education_level'] is chained assignment, which does not reliably
# modify df_final (under pandas Copy-on-Write it has no effect).
education_mode = df['education_level'].mode()[0]
df_final['education_level'] = df['education_level'].fillna(education_mode).map(d)
df_final.head()

In [None]:
# The 10 cities with the most rows that have a non-null target, largest first.
df[['city','target']].groupby('city').count().sort_values(['target'], ascending=False).head(10)

### Major Discipline

In [None]:
# Fraction of rows in which major_discipline is missing.
df['major_discipline'].isnull().mean()

In [None]:
## Replacing the NaN values with a 'Missing' label
# Assigned back to the column: fillna(inplace=True) on df['major_discipline'] is
# chained assignment, which does not reliably modify df (under pandas
# Copy-on-Write it has no effect).
df['major_discipline'] = df['major_discipline'].fillna('Missing')
df['major_discipline'].value_counts()

In [None]:
# Mean target value for each major_discipline category.
df.groupby(['major_discipline'])['target'].mean()

In [None]:
# Target-guided ordinal encoding:
# rank the major_discipline categories by their mean target (lowest mean gets 0)
# and keep the category -> rank lookup.
mean_target_by_major = df.groupby(['major_discipline'])['target'].mean()
ordinal_labels = mean_target_by_major.sort_values().index
ordinal_labels_dict = dict(zip(ordinal_labels, range(len(ordinal_labels))))
ordinal_labels_dict

In [None]:
# Replace each major_discipline label with its rank from ordinal_labels_dict.
df_final['major_discipline'] = df['major_discipline'].map(ordinal_labels_dict)
df_final.head()

### Gender

In [None]:
# Horizontal bar chart of how often each gender value occurs.
df['gender'].value_counts().plot(kind='barh')

In [None]:
# Counts of each target value within every gender category.
df.groupby(['gender'])['target'].value_counts()

Evenly distribute Male/Female/Other at random wherever Gender is missing.

In [None]:
import random
# Fill every missing gender with a label drawn uniformly at random from
# Male / Female / Other.
# A dedicated, seeded generator makes the fill reproducible across runs.
GENDER_FILL_SEED = 42
gender_options = ['Male', 'Female', 'Other']
gender_rng = random.Random(GENDER_FILL_SEED)

# One random candidate per row; fillna uses a candidate only where gender is NaN.
# This replaces a row-by-row loop that wrote through df_final['gender'].iloc[i],
# i.e. chained assignment, which does not reliably update df_final.
random_gender = pd.Series(gender_rng.choices(gender_options, k=len(df)), index=df.index)
df_final['gender'] = df['gender'].fillna(random_gender)

df_final.head()

In [None]:
# Add the target to the final frame and look at its counts per gender after the fill.
df_final['target'] = df['target']
df_final.groupby(['gender'])['target'].value_counts()

One-Hot Encoding Gender

In [None]:
# One-hot encode gender only. Calling get_dummies on the whole of df_final would
# also expand every other object-dtype column it holds.
# drop_first drops the first category in sorted order, leaving gender_Male and
# gender_Other as the indicator columns.
# The guard lets the cell be re-run after 'gender' has already been replaced.
if 'gender' in df_final.columns:
    df_gender = pd.get_dummies(df_final['gender'], prefix='gender', drop_first=True)
    df_final['gender_Male'] = df_gender['gender_Male']
    df_final['gender_Other'] = df_gender['gender_Other']
    df_final = df_final.drop(columns='gender')
df_final

### Company Size and Company Type

In [None]:
# Bring the two raw company columns into the final frame; they are cleaned in the cells below.
df_final['company_size'] = df['company_size']
df_final['company_type'] = df['company_type']

In [None]:
# Horizontal bar chart of how often each company_type occurs.
df_final['company_type'].value_counts().plot(kind='barh')

In [None]:
# Frequency of each company_size category.
df_final['company_size'].value_counts()

In [None]:
## Checking how many NaN of company_size and company_type are overlapping

# Give missing values an explicit label so that they show up in the grouping.
# The two columns use different spellings on purpose ('Missing' for company_size,
# 'missing' for company_type); keep them exactly as written.
# Assigned back to the columns: fillna(inplace=True) on df_final[col] is chained
# assignment, which does not reliably modify df_final (under pandas Copy-on-Write
# it has no effect).
df_final['company_size'] = df_final['company_size'].fillna('Missing')
df_final['company_type'] = df_final['company_type'].fillna('missing')
df_final.groupby(['company_size'])['company_type'].value_counts()

In [None]:
## Checking how the known company sizes are distributed within each company type
# Computed on df (not df_final); value_counts leaves NaN out by default, so only
# rows with both values present are counted.
df.groupby(['company_type'])['company_size'].value_counts()

In [None]:
# Replacing a missing company_size with a fixed size chosen per company_type.
# NOTE(review): the sizes below are presumably the most frequent size for each
# type, read off the table above — confirm against the data.
size_by_company_type = {
    'Early Stage Startup': '<10',
    'Funded Startup': '50-99',
    'NGO': '100-500',
    'Other': '100-500',
    'Public Sector': '1000-4999',
    'Pvt Ltd': '50-99',
}

# Vectorised version of a row-by-row loop that wrote through
# df_final['company_size'].iloc[i] — chained assignment, which does not reliably
# update df_final and is slow on large frames.
size_is_missing = df_final['company_size'] == 'Missing'
imputed_size = df_final.loc[size_is_missing, 'company_type'].map(size_by_company_type)
# Rows whose company_type has no entry in the lookup keep the 'Missing' label.
df_final.loc[size_is_missing, 'company_size'] = imputed_size.fillna('Missing')
        

In [None]:
# Display the final frame after the company_size imputation.
df_final

In [None]:
## Re-checking the company_size / company_type combinations after the size imputation

df_final.groupby(['company_size'])['company_type'].value_counts()

In [None]:
# Rows where company_type is still labelled 'missing' although company_size is known.
df_final[(df_final['company_type']=='missing') & (df_final['company_size']!='Missing')]

In [None]:
## Same company_size / company_type cross-count as above, shown again before company_type is filled

df_final.groupby(['company_size'])['company_type'].value_counts()

In [None]:
# Replacing a missing company_type with 'Pvt Ltd' wherever company_size is known.
# NOTE(review): 'Pvt Ltd' is presumably the most frequent company_type — confirm.
# Done with a single boolean mask instead of a row-by-row loop that wrote through
# df_final['company_type'].iloc[i] — chained assignment, which does not reliably
# update df_final.
type_missing_size_known = (
    (df_final['company_type'] == 'missing') & (df_final['company_size'] != 'Missing')
)
df_final.loc[type_missing_size_known, 'company_type'] = 'Pvt Ltd'
        

In [None]:
# Cross-count of company_size and company_type after company_type has been filled.
df_final.groupby(['company_size'])['company_type'].value_counts()

In [None]:
# Rows that still have a 'missing' company_type together with a known company_size.
df_final[(df_final['company_type']=='missing') & (df_final['company_size']!='Missing')]