In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [55]:
df = pd.read_csv('data/aug_train.csv')
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [None]:
df.info()

In [None]:
df.shape

In [56]:
df.drop('enrollee_id', axis =1, inplace=True)
df.shape

(19158, 13)

#### Percentage of null values

In [None]:
df.isnull().sum()/df.shape[0]*100

# Univariate analysis

## Categorical variables

In [None]:
def bar_plot(variable):
    # get feature
    var = df[variable]
    # count number of categorical variable(value/sample)
    varValue = var.value_counts()
    
    # visualize
    plt.figure(figsize = (9,3))
    plt.bar(varValue.index, varValue)
    plt.xticks(varValue.index, varValue.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()
    print("{}:\n{}".format(variable,varValue))

In [None]:
categorical = (df.dtypes == "object")
categorical_list = list(categorical[categorical].index)

print("Categorical variables:")
print(categorical_list)

In [None]:
import seaborn as sns
sns.set_style('darkgrid')
categorical_variables = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']
for c in categorical_variables:
    bar_plot(c)

## Numerical variables

In [None]:
df.info()

In [None]:
numerical_int64 = (df.dtypes == "int64")
numerical_int64_list = list(numerical_int64[numerical_int64].index)

print("Numerical:")
print(numerical_int64_list)

In [None]:
def plot_hist(variable):
    plt.figure(figsize = (9,3))
    plt.hist(df[variable], bins = 50)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} distribution with hist".format(variable))
    plt.show()

In [None]:
numerical_variables = ['training_hours']
for n in numerical_variables:
    plot_hist(n)

In [None]:
numerical_float64 = (df.dtypes == "float64")
numerical_float64_list = list(numerical_float64[numerical_float64].index)

print("Numerical variables:")
print(numerical_float64_list)

In [None]:
def plot_hist(variable):
    plt.figure(figsize = (9,3))
    plt.hist(df[variable], bins = 50)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.title("{} Distribution with Histogram".format(variable))
    plt.show()

In [None]:
numerical_variables = ['city_development_index', 'target']
for n in numerical_variables:
    plot_hist(n)

# Multivariate Analysis

In [None]:
categorical = (df.dtypes == "object")
categorical_list = list(categorical[categorical].index)

print("Categorical variables:")
print(categorical_list)

In [None]:
def x_vs_target(variable):
    return df[[variable,"target"]].groupby([variable], as_index = False).mean().sort_values(by="target",ascending = False)

In [None]:
variables = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']
for i in variables:
    print(f"\033[1m{i} vs target\033[0m \n")
    print(f"Unique {i} count")
    print(df[i].value_counts())
    print()
    print(x_vs_target(i))
    print("==================================\n\n\n")

## Correlation between numeric values

In [None]:
plt.figure(figsize=(12,8)) 
sns.heatmap(df.corr(), annot=True, cmap='Dark2_r', linewidths = 2)
plt.show()

# Preprocessing Data

### Filling null value

In [None]:
df.info()

In [57]:
df.drop('gender', axis =1, inplace=True)
df.shape

(19158, 12)

In [None]:
df.info()

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
df.company_size.value_counts()

In [58]:
df.drop('company_size', axis =1, inplace=True)
df.shape

(19158, 11)

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
19158*0.32

In [59]:
df['company_type'].fillna(df['company_type'].mode()[0], inplace=True)

In [None]:
df.shape

In [None]:
df.isnull().sum()/df.shape[0]*100

In [60]:
df['major_discipline'].fillna(df['major_discipline'].mode()[0], inplace=True)
df.isnull().sum()/df.shape[0]*100

city                      0.000000
city_development_index    0.000000
relevent_experience       0.000000
enrolled_university       2.014824
education_level           2.401086
major_discipline          0.000000
experience                0.339284
company_type              0.000000
last_new_job              2.207955
training_hours            0.000000
target                    0.000000
dtype: float64

In [None]:
df.shape

In [61]:
df['enrolled_university'].fillna(df['enrolled_university'].mode()[0], inplace=True)
df['education_level'].fillna(df['education_level'].mode()[0], inplace=True)
df['experience'].fillna(df['experience'].mode()[0], inplace=True)
df['last_new_job'].fillna(df['last_new_job'].mode()[0], inplace=True)
df.isnull().sum()/df.shape[0]*100

city                      0.0
city_development_index    0.0
relevent_experience       0.0
enrolled_university       0.0
education_level           0.0
major_discipline          0.0
experience                0.0
company_type              0.0
last_new_job              0.0
training_hours            0.0
target                    0.0
dtype: float64

In [None]:
df.shape

In [None]:
df.head().T

In [62]:
categorical = (df.dtypes == "object")
categorical_list = list(categorical[categorical].index)

print("Categorical variables:")
print(categorical_list)

Categorical variables:
['city', 'relevent_experience', 'enrolled_university', 'education_level', 'major_discipline', 'experience', 'company_type', 'last_new_job']


In [63]:
total = 0
for i in categorical_list:
    count = df[i].value_counts().shape[0]
    total = total+count
print("Total categorical variables with each unique value ", total)

Total categorical variables with each unique value  173


In [64]:
total = 0
for i in categorical_list:
    if i!='city':
        count = df[i].value_counts().shape[0]
        total = total+count
print("Total categorical variables with each unique value ", total)

Total categorical variables with each unique value  50


#### Let's deal with city | Handling dimentionality problems

In [65]:
cities = df.groupby('city')['city'].agg('count').sort_values(ascending=False)
cities.head(20)

city
city_103    4355
city_21     2702
city_16     1533
city_114    1336
city_160     845
city_136     586
city_67      431
city_75      305
city_102     304
city_104     301
city_73      280
city_100     275
city_71      266
city_11      247
city_90      197
city_61      197
city_28      192
city_23      182
city_65      175
city_36      160
Name: city, dtype: int64

In [None]:
len(cities[cities <= 20])

**If there are location which count less than 20, make them other category**

In [66]:
cities_20 = cities[cities <= 10]
cities_20 

city
city_127    10
city_59     10
city_167    10
city_133    10
city_109     9
city_106     9
city_131     9
city_146     8
city_79      8
city_81      7
city_2       7
city_180     7
city_107     6
city_120     6
city_139     5
city_62      5
city_179     5
city_82      4
city_25      4
city_8       4
city_166     4
city_31      4
city_18      4
city_111     3
city_129     3
city_121     3
city_171     1
city_140     1
Name: city, dtype: int64

In [None]:
len(df.city.unique())

In [67]:
df.city = df.city.apply(lambda x: 'other' if x in cities_20 else x)
len(df.city.unique())

96

## Create dummies [one hot encoding]

In [None]:
df.info()

In [68]:
dummies = pd.get_dummies(df)
dummies.head(3)

Unnamed: 0,city_development_index,training_hours,target,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,...,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
0,0.92,36,1.0,0,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
1,0.776,47,0.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0.624,83,0.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


# Building Model

In [69]:
X = dummies.drop(['target'],axis='columns')
X.head(3)

Unnamed: 0,city_development_index,training_hours,city_city_1,city_city_10,city_city_100,city_city_101,city_city_102,city_city_103,city_city_104,city_city_105,...,company_type_NGO,company_type_Other,company_type_Public Sector,company_type_Pvt Ltd,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
0,0.92,36,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0.776,47,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0.624,83,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [70]:
y = dummies['target']

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [72]:
X_train.shape, X_test.shape

((15326, 148), (3832, 148))

In [73]:
y_train.shape

(15326,)

In [74]:
y_test.shape

(3832,)

In [75]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
log_clf.fit(X_train,y_train)
log_clf.score(X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7753131524008351

In [79]:
log_clf.predict(X_test)[40]

0.0

In [80]:
np.array(y_test)[40]

0.0

In [89]:
from sklearn.metrics import classification_report
print(classification_report(log_clf.predict(X_test),y_test))

              precision    recall  f1-score   support

         0.0       0.92      0.81      0.86      3306
         1.0       0.32      0.56      0.41       526

    accuracy                           0.78      3832
   macro avg       0.62      0.69      0.63      3832
weighted avg       0.84      0.78      0.80      3832



### Saving to run more models

In [83]:
dummies.to_csv('./data/preprocessed_data_for_model_building.csv')

In [84]:
data = pd.read_csv('data/preprocessed_data_for_model_building.csv')

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [86]:
from sklearn.ensemble import RandomForestClassifier

In [87]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [88]:
clf.score(X_test,y_test)

0.7575678496868476