<a href="https://colab.research.google.com/github/sr606/Machine_Learning_CaseStudies/blob/main/OLA_Ensemble_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings("ignore")

In [3]:
ola = pd.read_csv("/content/drive/MyDrive/Files/ola_driver_scaler.csv")

In [4]:
ola.head()

Unnamed: 0.1,Unnamed: 0,MMM-YY,Driver_ID,Age,Gender,City,Education_Level,Income,Dateofjoining,LastWorkingDate,Joining Designation,Grade,Total Business Value,Quarterly Rating
0,0,01/01/19,1,28.0,0.0,C23,2,57387,24/12/18,,1,1,2381060,2
1,1,02/01/19,1,28.0,0.0,C23,2,57387,24/12/18,,1,1,-665480,2
2,2,03/01/19,1,28.0,0.0,C23,2,57387,24/12/18,03/11/19,1,1,0,2
3,3,11/01/20,2,31.0,0.0,C7,2,67016,11/06/20,,2,2,0,1
4,4,12/01/20,2,31.0,0.0,C7,2,67016,11/06/20,,2,2,0,1


#**Exploratory Data Analysis**

In [5]:
ola.shape

(19104, 14)

In [6]:
print("Rows in the ola dataset:", ola.shape[0])
print("Columns in the ola dataset:", ola.shape[1])

Rows in the ola dataset: 19104
Columns in the ola dataset: 14


In [7]:
ola.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            19104 non-null  int64  
 1   MMM-YY                19104 non-null  object 
 2   Driver_ID             19104 non-null  int64  
 3   Age                   19043 non-null  float64
 4   Gender                19052 non-null  float64
 5   City                  19104 non-null  object 
 6   Education_Level       19104 non-null  int64  
 7   Income                19104 non-null  int64  
 8   Dateofjoining         19104 non-null  object 
 9   LastWorkingDate       1616 non-null   object 
 10  Joining Designation   19104 non-null  int64  
 11  Grade                 19104 non-null  int64  
 12  Total Business Value  19104 non-null  int64  
 13  Quarterly Rating      19104 non-null  int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 2.0+ MB


**Column Profiling:**



1.   MMM - YY: Reporting Date (Monthly) (date-time)
2.   Age: Age of the employee(numerical)
3. Gender: Gender of the employee(categorical)
4. City: City Code of the employee(categorical)
5. Education_Level: Education level - 0 for 10+ , 1 for 12+, 2 for graduate (categorical)

6. Income : Monthly average Income of the employee (numerical)

7.  Date Of Joining : Joining date for the employee (date-time)

8. LastWorkingDate : Last date of working for the employee - Target Feature (date-time, but will be converted to categorical)

9. Joining Designation : Designation of the employee at the time of joining (categorical, ordinal)

10.  Grade : Grade of the employee at the time of reporting (categorical, ordinal)

11. Total Business Value : The total business value acquired by the employee in a month (negative business indicates cancellation/refund or car EMI adjustments) (numerical)

12.  Quarterly Rating : Quarterly rating of the employee: 1,2,3,4,5 (categorical, ordinal - higher is better)



In [8]:
ola.describe()

Unnamed: 0.1,Unnamed: 0,Driver_ID,Age,Gender,Education_Level,Income,Joining Designation,Grade,Total Business Value,Quarterly Rating
count,19104.0,19104.0,19043.0,19052.0,19104.0,19104.0,19104.0,19104.0,19104.0,19104.0
mean,9551.5,1415.591133,34.668435,0.418749,1.021671,65652.025126,1.690536,2.25267,571662.1,2.008899
std,5514.994107,810.705321,6.257912,0.493367,0.800167,30914.515344,0.836984,1.026512,1128312.0,1.009832
min,0.0,1.0,21.0,0.0,0.0,10747.0,1.0,1.0,-6000000.0,1.0
25%,4775.75,710.0,30.0,0.0,0.0,42383.0,1.0,1.0,0.0,1.0
50%,9551.5,1417.0,34.0,0.0,1.0,60087.0,1.0,2.0,250000.0,2.0
75%,14327.25,2137.0,39.0,1.0,2.0,83969.0,2.0,3.0,699700.0,3.0
max,19103.0,2788.0,58.0,1.0,2.0,188418.0,5.0,5.0,33747720.0,4.0


In [9]:
ola.describe(include= 'object')

Unnamed: 0,MMM-YY,City,Dateofjoining,LastWorkingDate
count,19104,19104,19104,1616
unique,24,29,869,493
top,01/01/19,C20,23/07/15,29/07/20
freq,1022,1008,192,70


In [10]:
ola.drop(columns = 'Unnamed: 0', axis=1, inplace=True)

In [11]:
ola.nunique()

Unnamed: 0,0
MMM-YY,24
Driver_ID,2381
Age,36
Gender,2
City,29
Education_Level,3
Income,2383
Dateofjoining,869
LastWorkingDate,493
Joining Designation,5


In [12]:
ola.isna().sum()

Unnamed: 0,0
MMM-YY,0
Driver_ID,0
Age,61
Gender,52
City,0
Education_Level,0
Income,0
Dateofjoining,0
LastWorkingDate,17488
Joining Designation,0


###**DATA PROCESSING AND FEATURE ENGINEERING**

In [16]:
ola1.head()

Unnamed: 0,MMM-YY,Driver_ID,Age,Gender,City,Education_Level,Income,Dateofjoining,LastWorkingDate,Joining Designation,Grade,Total Business Value,Quarterly Rating
0,01/01/19,1,28.0,0.0,C23,2,57387,24/12/18,,1,1,2381060,2
1,02/01/19,1,28.0,0.0,C23,2,57387,24/12/18,,1,1,-665480,2
2,03/01/19,1,28.0,0.0,C23,2,57387,24/12/18,03/11/19,1,1,0,2
3,11/01/20,2,31.0,0.0,C7,2,67016,11/06/20,,2,2,0,1
4,12/01/20,2,31.0,0.0,C7,2,67016,11/06/20,,2,2,0,1


In [13]:
ola1 = ola.copy(deep=True)

In [14]:
## Target variable creation: Create a column called target which tells whether the driver has left the company

## diver whose last working day is present will have 1
first = (ola1.groupby('Driver_ID').agg({'LastWorkingDate':'last'})['LastWorkingDate'].isna()).reset_index()
first['LastWorkingDate'].replace({True:1, False:0}, inplace = True)
first.rename(columns={'LastWorkingDate': 'target'}, inplace = True)
first.head()

Unnamed: 0,Driver_ID,target
0,1,0
1,2,1
2,4,0
3,5,0
4,6,1


In [17]:
# Create column which tells whether the quaterly rating has increased for that driver -
# for those whose quaterly rating has increased we assign the value 1
QR1 = (ola1.groupby('Driver_ID').agg({'Quarterly Rating': 'first'})['Quarterly Rating']).reset_index()
QR2 = (ola1.groupby('Driver_ID').agg({'Quarterly Rating': 'last'})['Quarterly Rating']).reset_index()

In [18]:
QR1.shape,QR2.shape

((2381, 2), (2381, 2))

In [19]:
QR1.isna().sum(),QR2.isna().sum()

(Driver_ID           0
 Quarterly Rating    0
 dtype: int64,
 Driver_ID           0
 Quarterly Rating    0
 dtype: int64)

In [21]:
first = first.merge(QR1, on='Driver_ID')
first = first.merge(QR2, on = 'Driver_ID')

In [22]:
first.head()

Unnamed: 0,Driver_ID,target,Quarterly Rating_x,Quarterly Rating_y
0,1,0,2,2
1,2,1,1,1
2,4,0,1,1
3,5,0,1,1
4,6,1,1,2
