**There is a great deal one can do when analysing data. Don't believe me? Go through this notebook once and you will see how far an analysis can be taken, even on a dataset as small as 300 rows.**

**Below is an analysis of a small dataset on employee attrition. It includes the following:**

1. Data Preparation 
2. Exploratory data analysis
3. Clustering to find patterns
4. Frequent pattern mining
5. Forecasting and Predictions

# DATA PREPARATION

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [None]:
main_data = pd.read_csv('../input/attritiondata/Attrition_data.csv')

In [None]:
main_data.head()

In [None]:
location_clean = pd.read_csv('../input/attritiondata/location_clean.csv')
location_clean.head()

In [None]:
data = pd.merge(main_data, location_clean, how= 'inner',left_on = 'S.No', right_on='id' )
data.drop('id',axis =1, inplace = True)
# data.head()

In [None]:
assert location_clean.shape[0] == data.shape[0]

In [None]:
data.shape

### Converting the columns into right datatypes and extracting data

In [None]:
data = data.rename(columns = {'Engagement Score (% Satisfaction)':'sat_score'})
data['sat_score'] = data['sat_score'].apply(lambda x:x[:-1])
# data.head()

In [None]:
data['sat_score'] = data['sat_score'].astype('int')

In [None]:
# np.where(data['Location'].isna())
# data.loc[[48,111],:]
# data.drop([48,111],axis = 0, inplace= True) #For now
# data.isna().sum().sum()

In [None]:
data.isna().sum()

> Use the statement below when modelling, or whenever null values could cause problems

In [None]:
# data = data[data['doubtful']=='NO']
# data.isna().sum()

In [None]:
# np.where(data.isna())

In [None]:
# data.iloc[[  2,  23,  63, 193],:]

> The 4 admin locations have to be handled for districts

In [None]:
data['Last Rating'] = data['Last Rating'].apply(lambda x: str(x))
# data.info()

In [None]:
def to_float(x):
    """Convert a raw Tenure cell to ``float``.

    Values that ``float`` rejects with ``ValueError`` are retried after
    every space character has been removed (e.g. ``'1 .5'`` -> ``1.5``).
    """
    try:
        number = float(x)
    except ValueError:
        # Some entries carry a stray space inside the number.
        number = float(x.replace(' ', ''))
    return number
data['Tenure'] = data['Tenure'].apply(to_float)
# data.head()

In [None]:
from datetime import datetime
def converter(x):
    """Parse a date string from the DOJ / In Active Date columns.

    Two layouts are tried in order: ``%d-%b-%y`` (abbreviated month name,
    e.g. ``05-Jul-10``) and then ``%d-%m-%y`` (numeric month, e.g.
    ``05-07-10``).

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: if ``x`` matches neither layout.
    """
    try:
        return datetime.strptime(x, '%d-%b-%y')
    except ValueError:
        # Only a format mismatch should trigger the fallback; anything else
        # (e.g. an interrupt) must propagate instead of being swallowed.
        # Numeric-month values seen in the DOJ column include:
        # 05-07-10, 02-09-10, 01-08-11, 12-03-04, 01-06-11, 09-08-07,
        # 05-05-08, 12-10-09, 07-02-11
        return datetime.strptime(x, '%d-%m-%y')

data['DOL_date'] = data['In Active Date'].apply(converter)
data['DOJ_date'] = data['DOJ'].apply(converter)
# data.head()

In [None]:
data.drop(['DOJ','In Active Date'], axis = 1, inplace =True)

In [None]:
def safe_strip(x):
    """Return ``x.strip()`` when ``x`` supports it, otherwise ``x`` unchanged.

    Lets the function be applied to object columns that mix strings with
    values that have no ``strip`` method.
    """
    try:
        stripped = x.strip()
    except AttributeError:
        # No .strip() on this value: hand it back untouched.
        return x
    return stripped
    
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].apply(safe_strip) ## Some values in the Designation column had extra spaces 

In [None]:
data.groupby('Designation')['Grade'].apply(lambda x: x.unique())
### OR data[['Designation','Grade']].drop_duplicates().sort_values('Grade')

> There is a 1:1 relation between Designation and Grade, so one of them can be dropped. Designation is dropped because Grade has an obvious ordering.

In [None]:
data['Zone'] = data['Zone'].apply(lambda x: x.lower()) ## CENTRAL and central, north and North, south and South pairs were present
data['Zone'].value_counts()

In [None]:
data['Marital Status'].value_counts()

In [None]:
data['Gender'].value_counts()

In [None]:
data['Education'].value_counts()

### Final Check 
1. S.No
2. EmpID - To be dropped. Useless
3. Emp Name - To be dropped. Useless*
4. Designation - Stripped Extra space and then dropped - Has 1 to 1 with 'Grade'
5. Grade - Has 1 to 1 with 'Designation'
6. Attrition - To be dropped. Useless
7. *Location ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ TO BE WORKED UPON*
8. Tenure - Converted datatype. Fixed some values having space in between
9. Gender 
10. Education
11. Age
12. Last Rating - changed dtype to object
13. Monthly Income
14. sat_score - Removed % sign and converted to int
15. Marital Status
16. Zone - Lower cased values
17. Remarks 
18. In Active Date- dropped. Instead created DOL_date having datetime datatype
19. DOJ - dropped . Instead created DOJ_date having datetime datatype.

In [None]:
data.columns

> Dropping EmpID and Emp Name, as they are redundant in the presence of S.No.

> Attrition is always "YES"

> Designation is redundant in presence of grade

In [None]:
data.drop(['EmpID','Emp Name','Attrition ','Designation'],axis =1 , inplace =True)
data.head()

### Feature Engineering

In [None]:
data['tenure_days'] = (data['DOL_date'] - data['DOJ_date']).apply(lambda x:x.days)
# data.head()

In [None]:
data.columns

In [None]:
data = data.rename(columns = {'S.No':'id', 'Last Rating':'rating','Monthly Income':'income','Marital Status': 'marital_status'})
# data.head()

In [None]:
data.columns = [col.lower() for col in data.columns]
# data.head()

In [None]:
# data.columns

In [None]:
data = data.drop(['doubtful','location','changed'] ,axis = 1)

> Use `corrected_location` instead of `location`

> No need for `changed`, as the `location` column has been dropped

> Leave `doubtful` in the dataset 

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.to_csv('data_complete_location.csv', index= False)

# EXPLORATORY DATA ANALYSIS

### Univariate visualization

In [None]:
numeric_col = [col for col in data.columns if data[col].dtype in ['int64','int32','float64'] and col not in ['id','tenure']]
numeric_col

In [None]:
data[numeric_col].hist(figsize=(16, 8));

In [None]:
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))
i = 0
j = 0
for col in numeric_col:
    _=sns.distplot(data[col], ax=axes[i][j]);
    _=plt.xticks(rotation=90)
    j+=1
    if j==2:
        i+=1
        j=0

In [None]:
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))
i = 0
j = 0
for col in numeric_col:
    _=sns.boxplot(data[col], ax=axes[i][j]);
    _=plt.xticks(rotation=90)
    j+=1
    if j==2:
        i+=1
        j=0

In [None]:
cat_cols = [col for col in data.columns if data[col].dtype == 'object']
cat_cols

In [None]:
%matplotlib inline
# One count plot per categorical column, laid out on a 5x2 grid that shares
# the y axis so bar heights are comparable across panels.
_, axes = plt.subplots(nrows=5, ncols=2, sharey=True, figsize=(16, 24))
# plt.subplot_tool() ## Works for interactive
plt.subplots_adjust(hspace=0.8)
# Columns whose tick labels are rotated to vertical below.
rotated_cols = ['remarks', 'corrected_location', 'district', 'state']
plot_cols = [col for col in cat_cols if col != 'location']
for position, col in enumerate(plot_cols):
    # Fill the grid row by row, two panels per row.
    i, j = divmod(position, 2)
    # value_counts() sorts by descending frequency, so its index is the bar
    # order. Reading the index directly avoids relying on the column name
    # produced by reset_index(), which is no longer 'index' in pandas >= 2.0.
    ordered_levels = list(data[col].value_counts().index)
    g = sns.countplot(x=col, data=data, ax=axes[i][j], order=ordered_levels)
    if col in rotated_cols:
        _ = g.set_xticklabels(g.get_xticklabels(), rotation=90)


### Multivariate visualization

In [None]:
corr_matrix = data[numeric_col].corr()
sns.heatmap(corr_matrix, annot = True);

In [None]:
# `pairplot()` may become very slow with the SVG or retina format
%config InlineBackend.figure_format = 'png'
sns.pairplot(data[numeric_col]);
%config InlineBackend.figure_format = 'retina'

In [None]:
numeric_col

In [None]:
cat_cols

In [None]:
_, axes = plt.subplots(nrows=5, ncols=2,sharey=True, figsize=(16, 30))
plt.subplots_adjust(hspace=0.8)

i = 0
j = 0
for col in cat_cols:
    if col == 'location': continue
    g=sns.boxplot(x=col,y='tenure_days', data=data, ax=axes[i][j]);
    if col in  ['remarks','corrected_location','district','state']:
        _=g.set_xticklabels(g.get_xticklabels(), rotation=90)
#     _ = plt.xticks(rotation=90)
    j+=1
    if j==2:
        i+=1
        j=0


In [None]:
# _, axes = plt.subplots(2, 4, sharey=True, figsize=(12, 8))
plt.figure(figsize= (8,6))
sns.boxplot(x='grade', y='income', data=data[data['income']<1e5], order = sorted(data['grade'].unique()));

# CLUSTERING

In [None]:
%reset -f

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set()

In [None]:
data = pd.read_csv('../input/attritiondata/data_complete_location.csv')
data.head()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Filter data
# .copy() makes left_emp an independent frame; without it, adding a column to
# left_emp afterwards writes into a slice of `data` (SettingWithCopyWarning).
left_emp = data[['sat_score', 'rating']].copy()
# Create groups using K-means clustering.

# Standardise both features so neither dominates the Euclidean distance.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
kmeans = KMeans(n_clusters=4, random_state=10).fit(left_emp_scaled)

In [None]:
left_emp['label'] = kmeans.labels_
# Draw scatter plot
_ = plt.scatter(left_emp['sat_score'], left_emp['rating'], c=left_emp['label'],cmap='Accent')
_ = plt.xlabel('Satisfaction Level')
_ = plt.ylabel('Last Evaluation')
_ = plt.title('4 Clusters of employees who left')
plt.show()

In [None]:
# Keep tenure and income for employees earning below 1e5.
# .copy() makes left_emp an independent frame, so the 'label' column added
# below is not written into a slice of `data` (SettingWithCopyWarning).
left_emp = data.loc[data['income'] < 1e5, ['tenure', 'income']].copy()
# Create groups using K-means clustering.

# Standardise both features so neither dominates the Euclidean distance.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
kmeans = KMeans(n_clusters=4, random_state=10).fit(left_emp_scaled)

left_emp['label'] = kmeans.labels_
# Draw scatter plot
_ = plt.scatter(left_emp['tenure'], left_emp['income'], c=left_emp['label'], cmap='Accent')
_ = plt.xlabel('Tenure')
_ = plt.ylabel('Income')
_ = plt.title('4 Clusters of employees who left')
plt.show()

In [None]:
# Keep age and income for employees earning below 1e5.
# .copy() makes left_emp an independent frame, so the 'label' column added
# below is not written into a slice of `data` (SettingWithCopyWarning).
left_emp = data.loc[data['income'] < 1e5, ['age', 'income']].copy()
# Create groups using K-means clustering.

# Standardise both features so neither dominates the Euclidean distance.
ss = StandardScaler()
left_emp_scaled = ss.fit_transform(left_emp)
kmeans = KMeans(n_clusters=6, random_state=10).fit(left_emp_scaled)

left_emp['label'] = kmeans.labels_
# Draw scatter plot
_ = plt.scatter(left_emp['age'], left_emp['income'], c=left_emp['label'], cmap='Accent')
_ = plt.xlabel('Age')
_ = plt.ylabel('Income')
_ = plt.title('6 Clusters of employees who left')
plt.show()

# FREQUENT PATTERN MINING

In [None]:
%reset -f

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set()

In [None]:
data = pd.read_csv('../input/attritiondata/data_complete_location.csv')
data.head()

In [None]:
grade_int = {'E1':1,'E2':2,'M1':3,'M2':4,'M3':5,'M4':6,'CXO':7}
data['grade_int'] = data['grade'].apply(lambda x: grade_int[x])

In [None]:
not_required =  ['grade','dol_date','doj_date','id','corrected_location','district']

In [None]:
selected_cats = [ col for col in data.columns if data[col].dtype=='object' and col not in not_required]
selected_cats

In [None]:
selected_nums = [col for col in data.columns if col not in selected_cats+not_required]

In [None]:
selected_nums

## Frequent Item Sets

Some points to be noted:

1. Income is dependent on the grade of the employee.
2. Age and income are positively correlated
3. Due to the above two points, only grade is considered for the frequent item sets calculation
4. Tenure and sat_score are binned so that they can be used in the frequent itemset calculation.

In [None]:
sns.distplot(data['tenure'])

In [None]:
sns.distplot(data['sat_score'])

In [None]:
def sat_binner(x, width=20):
    """Return the bin number of satisfaction score ``x``.

    Bins are ``width`` wide and closed on the right: with the default
    width, 1..20 -> 1, 21..40 -> 2, and so on. An exact multiple of
    ``width`` stays in the lower bin, so 0 maps to bin 0.

    Args:
        x: the score to bin.
        width: bin width; defaults to 20, the value used for ``sat_score``.
    """
    quotient, remainder = divmod(x, width)
    # Round up unless x sits exactly on a bin's upper edge.
    return quotient if remainder == 0 else quotient + 1
data['sat_binned'] = data['sat_score'].apply(sat_binner).astype('object')

In [None]:
def tenure_binner(x, width=2):
    """Return the bin number of tenure ``x``.

    Bins are ``width`` wide and closed on the right: with the default
    width, values in (0, 2] -> 1, (2, 4] -> 2, and so on. An exact multiple
    of ``width`` stays in the lower bin, so 0 maps to bin 0.

    Args:
        x: the tenure value to bin.
        width: bin width; defaults to 2, the value used for ``tenure``.
    """
    quotient, remainder = divmod(x, width)
    # Round up unless x sits exactly on a bin's upper edge.
    return quotient if remainder == 0 else quotient + 1
data['tenure_binned'] = data['tenure'].apply(tenure_binner).astype('object')

In [None]:
cols_for_frequent_items = ['grade','gender','education','rating','marital_status','zone','remarks','tenure_binned','sat_binned']

In [None]:
data_fp = data[cols_for_frequent_items]
# data_fp.head()

In [None]:
data_fp_enc = pd.get_dummies(data_fp, columns = data_fp.columns)
data_fp_enc.head()

In [None]:
pd.set_option('max_colwidth', 100)
# pd.set_option('max_rows',200)

In [None]:
from mlxtend.frequent_patterns import apriori

freq_pattern = apriori(data_fp_enc, min_support=0.20, use_colnames=True)
freq_pattern['length'] = freq_pattern['itemsets'].apply(lambda x: len(x) )
freq_pattern[freq_pattern['length']>=4].sort_values('support',ascending= False)

In [None]:
fp2 = data[(data['gender']== 'Male') & (data['grade']=='E1') & (data['education'] =='Bachelors') & (data['tenure']<=2) ]
fp2.groupby('remarks').size().sort_values(ascending = False)
# fp2.groupby('zone').size().sort_values(ascending = False)
# fp2.groupby('rating').size().sort_values(ascending = False)
# fp2.groupby('sat_binned').size().sort_values(ascending = False)
# fp2.groupby('marital_status').size().sort_values(ascending = False)

In [None]:
# Interesting FP 
fp1 = data[(data['gender']== 'Male') & (data['grade']=='E1') & (data['education'] =='Bachelors') & (data['remarks']=='Issues with the Manager') ]

# Not so interesting other features
# fp1.groupby('marital_status').size().sort_values(ascending = False)
# fp1.groupby('zone').size().sort_values(ascending = False)
# fp1.groupby('rating').size().sort_values(ascending = False)
# fp1.groupby('sat_binned').size().sort_values(ascending = False)
# fp1.groupby('tenure_binned').size().sort_values(ascending = False)


In [None]:
#Not very interesting
# fp3 = data[(data['gender']== 'Male') & (data['grade']=='E1') & (data['education'] =='Bachelors') & (data['remarks']=='Issues with the Manager') &  (data['tenure']<=2)]
# fp3.groupby('zone').size().sort_values(ascending = False)
# fp3.groupby('rating').size().sort_values(ascending = False)
# fp3.groupby('sat_binned').size().sort_values(ascending = False)
# fp3.groupby('marital_status').size().sort_values(ascending = False)

# TENURE PREDICTION

In [None]:
%reset -f

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
from math import sqrt
np.random.seed(42)

In [None]:
data = pd.read_csv('../input/attritiondata/data_complete_location.csv')
data.head()

In [None]:
# data_used = data.drop(['corrected_location','district','state','doj_date','dol_date','tenure_days'], axis =1)
data_pred = data[['id','grade','tenure','gender','education','age','rating','income','sat_score','marital_status',\
                 'zone','remarks']]
# data_pred.info()

In [None]:
X = data_pred.drop(['id','tenure'], axis =1)
y = data_pred['tenure']

In [None]:
selected_cats = [col for col in X.columns if X[col].dtype == 'object']
selected_nums = [col for col in X.columns if col  not in selected_cats]

In [None]:
X = pd.get_dummies(X, columns = selected_cats)

In [None]:
train_samples = int(0.9*data_pred.shape[0])
train_indices = list(range(train_samples))
val_indices = list(range(train_samples, data_pred.shape[0]))
train_X = X.loc[train_indices, : ]
train_y = y.loc[train_indices]
val_X = X.loc[val_indices, : ]
val_y = y.loc[val_indices]

train_X.shape
train_y.shape
val_X.shape
val_y.shape

In [None]:
ss= StandardScaler()
train_X_scaled = pd.DataFrame(ss.fit_transform(train_X), columns = train_X.columns)
train_y_logged = np.log1p(train_y)
val_X_scaled = pd.DataFrame(ss.transform(val_X), columns = val_X.columns)
# val_y = np.log1p(val_y)

In [None]:
def fit_model(model):
    """Fit a regressor class on the scaled training data and report validation error.

    ``model`` is an estimator *class*, not an instance. It is instantiated
    with its defaults, except ``DecisionTreeRegressor`` which receives
    ``random_state=291``. Training uses the module-level ``train_X_scaled``
    and ``train_y_logged``; predictions on ``val_X_scaled`` are passed
    through ``np.expm1`` before being compared with ``val_y``.

    Prints MAE and RMSE and plots predicted against original tenure. For
    Lasso, Ridge, LinearRegression, XGBRegressor and DecisionTreeRegressor
    it also draws a bar chart of the 15 largest-magnitude coefficients or
    feature importances. Returns ``None``.
    """
    reg = model(random_state=291) if model == DecisionTreeRegressor else model()
    reg.fit(train_X_scaled, train_y_logged)
    # Training used the log1p-transformed target; undo that before scoring.
    val_y_hat = np.expm1(reg.predict(val_X_scaled))
    # sklearn metrics take (y_true, y_pred), in that order.
    print(f'MAE: {mean_absolute_error(val_y, val_y_hat)}')
    print(f'RMSE: {sqrt(mean_squared_error(val_y, val_y_hat))}')

    fig, ax = plt.subplots(1, 2, figsize=(16, 4))

    positions = list(range(len(val_y)))
    ax[0].plot(positions, val_y_hat, label='Predicted Tenure (in yrs)')
    ax[0].plot(positions, val_y, label='Original  Tenure (in yrs)')
    ax[0].legend(loc='best')
    ax[0].set_title('Predictions')

    print(f'Using model : {model}')
    if model in [Lasso, Ridge, LinearRegression]:
        weights = reg.coef_
    elif model in [XGBRegressor, DecisionTreeRegressor]:
        weights = reg.feature_importances_
    else:
        print("No feature importance graph for DummyRegressor")
        return

    coeff_df = pd.DataFrame(weights, train_X_scaled.columns, columns=['Coefficient'])
    # Rank by magnitude while keeping the signed value for the chart.
    coeff_df["abs"] = coeff_df["Coefficient"].abs()
    coeff_df = coeff_df.sort_values(by="abs", ascending=False).drop("abs", axis=1)

    ax[1].bar(coeff_df.index[:15], coeff_df['Coefficient'][:15])
    # Rotate labels on this axis explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax[1].tick_params(axis='x', labelrotation=90)
    ax[1].set_title('Feature importance')


In [None]:
fit_model(DummyRegressor)

In [None]:
fit_model(LinearRegression)

In [None]:
fit_model(DecisionTreeRegressor)

In [None]:
## To visualize the Decision Tree - But the tree is too big

# dt =DecisionTreeRegressor()
# dt.fit(train_X_scaled, train_y_logged)
# from sklearn.tree import export_graphviz
# export_graphviz(dt, out_file ='tree.dot', 
#                feature_names =train_X_scaled.columns)  

In [None]:
fit_model(XGBRegressor)

In [None]:
def plot_ensemble(model1, model2):
    """Average the predictions of two regressor classes and report validation error.

    Both arguments are estimator *classes*. ``model1`` receives
    ``random_state=291`` when it is ``DecisionTreeRegressor``; otherwise
    both are instantiated with defaults. Each is trained on the
    module-level ``train_X_scaled`` / ``train_y_logged``; each model's
    prediction on ``val_X_scaled`` is passed through ``np.expm1`` and the
    two results are averaged.

    Prints MAE and RMSE against ``val_y`` and plots predicted against
    original tenure. Returns ``None``.
    """
    m1 = model1(random_state=291) if model1 == DecisionTreeRegressor else model1()
    m1.fit(train_X_scaled, train_y_logged)
    m2 = model2()
    m2.fit(train_X_scaled, train_y_logged)

    # Undo the log1p target transform per model, then take the mean.
    pred_1 = np.expm1(m1.predict(val_X_scaled))
    pred_2 = np.expm1(m2.predict(val_X_scaled))
    val_y_hat = (pred_1 + pred_2) / 2.0
    # sklearn metrics take (y_true, y_pred), in that order.
    print(f'MAE: {mean_absolute_error(val_y, val_y_hat)}')
    print(f'RMSE: {sqrt(mean_squared_error(val_y, val_y_hat))}')

    fig, ax = plt.subplots(1, 1)

    positions = list(range(len(val_y)))
    ax.plot(positions, val_y_hat, label='Predicted Tenure (in yrs)')
    ax.plot(positions, val_y, label='Original  Tenure (in yrs)')
    ax.legend(loc='best')
    ax.set_title('Predictions')

In [None]:
plot_ensemble(LinearRegression, XGBRegressor)

In [None]:
plot_ensemble(DecisionTreeRegressor, XGBRegressor)

# FORECAST COUNT

In [None]:
%reset -f

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [None]:
from math import sqrt
from scipy.stats import mode
np.random.seed(42)

In [None]:
data = pd.read_csv('../input/attritiondata/data_complete_location.csv')
data.head()

In [None]:
X = data[['dol_date']].copy()
X['Count'] = 1
# X.head()

In [None]:
# data_used = data.drop(['corrected_location','district','state','doj_date','dol_date','tenure_days'], axis =1)
X['dol_date'] = pd.to_datetime(X["dol_date"], format="%Y-%m-%d")

In [None]:
X.set_index('dol_date', inplace = True)

In [None]:
X_comp =  X.Count.resample('D').sum().reset_index()

In [None]:
X_comp["date"] = X_comp["dol_date"].apply(lambda x: x.day)
X_comp["month"] = X_comp["dol_date"].apply(lambda x: x.month)
X_comp["quarter"] = X_comp["dol_date"].apply(lambda x: x.quarter)
X_comp["year"] = X_comp["dol_date"].apply(lambda x: x.year)
X_comp["weekday"] = X_comp["dol_date"].apply(lambda x: x.dayofweek)
X_comp["dayofyear"] = X_comp["dol_date"].apply(lambda x: x.dayofyear)
X_comp["weekofyear"] = X_comp["dol_date"].apply(lambda x: x.weekofyear)
# X_comp.head()
# X_comp["day_count"] = X_comp["dol_date"].apply(lambda x: x.toordinal())

In [None]:
X_comp['fired'] = X_comp['Count'].apply(lambda x: 1 if x>=1 else 0)

In [None]:
y = X_comp['fired']
X_comp.drop(['fired','Count','dol_date'],axis = 1, inplace =True)
# X_comp.head()

In [None]:
y.value_counts()

In [None]:
train_samples = int(0.9*X_comp.shape[0])
train_indices = list(range(train_samples))
val_indices = list(range(train_samples, X_comp.shape[0]))
train_X = X_comp.loc[train_indices, : ]
train_y = y.loc[train_indices]
val_X = X_comp.loc[val_indices, : ]
val_y = y.loc[val_indices]

train_X.shape
train_y.shape
val_X.shape
val_y.shape

In [None]:
ss= StandardScaler()
train_X_scaled = pd.DataFrame(ss.fit_transform(train_X), columns = train_X.columns)
# train_y_logged = np.log1p(train_y)
val_X_scaled = pd.DataFrame(ss.transform(val_X), columns = val_X.columns)
# val_y = np.log1p(val_y)

In [None]:
def fit_model(model):
    """Fit a classifier class on the scaled training data and report validation scores.

    ``model`` is an estimator *class*, not an instance. It is instantiated
    with defaults, except ``DecisionTreeClassifier`` (``random_state=1``)
    and ``DummyClassifier`` (always predicts the constant ``1``). Training
    uses the module-level ``train_X_scaled`` / ``train_y``; predictions on
    ``val_X_scaled`` are scored against ``val_y``.

    Prints F1, accuracy, precision and recall and plots predictions against
    the data. For LogisticRegression, XGBClassifier and
    DecisionTreeClassifier it also draws a bar chart of the 15
    largest-magnitude coefficients or feature importances. Returns ``None``.
    """
    if model == DecisionTreeClassifier:
        reg = model(random_state=1)
    elif model == DummyClassifier:
        reg = model(strategy='constant', constant=1)
    else:
        # Everything else, XGBClassifier included, runs with library defaults.
        reg = model()
    reg.fit(train_X_scaled, train_y)
    val_y_hat = reg.predict(val_X_scaled)

    # sklearn metrics take (y_true, y_pred). Passing them reversed swaps the
    # reported precision and recall.
    print(f'F1: {f1_score(val_y, val_y_hat)}')
    print(f'Accuracy: {accuracy_score(val_y, val_y_hat)}')
    print(f'Precision: {precision_score(val_y, val_y_hat)}')
    print(f'Recall: {recall_score(val_y, val_y_hat)}')

    fig, ax = plt.subplots(1, 2, figsize=(16, 4))

    positions = list(range(len(val_y)))
    ax[0].plot(positions, val_y_hat, label='Predicted Firing')
    ax[0].plot(positions, val_y, label='Data')
    ax[0].legend(loc='best')
    ax[0].set_title('Predictions')

    print(f'Using model : {model}')
    if model == LogisticRegression:
        # coef_ is indexed [0] to get a flat vector of per-feature weights.
        weights = reg.coef_[0]
    elif model in [XGBClassifier, DecisionTreeClassifier]:
        weights = reg.feature_importances_
    else:
        print("No feature importance graph for DummyClassifier")
        return

    coeff_df = pd.DataFrame(weights, train_X_scaled.columns, columns=['Coefficient'])
    # Rank by magnitude while keeping the signed value for the chart.
    coeff_df["abs"] = coeff_df["Coefficient"].abs()
    coeff_df = coeff_df.sort_values(by="abs", ascending=False).drop("abs", axis=1)

    ax[1].bar(coeff_df.index[:15], coeff_df['Coefficient'][:15])
    # Rotate labels on this axis explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax[1].tick_params(axis='x', labelrotation=90)
    ax[1].set_title('Feature importance')


In [None]:
fit_model(DummyClassifier)

In [None]:
fit_model(LogisticRegression)

In [None]:
fit_model(DecisionTreeClassifier)

In [None]:
fit_model(XGBClassifier)

In [None]:
def plot_ensemble(model1, model2, model3):
    """Majority-vote three classifier classes and report validation scores.

    All arguments are estimator *classes*. ``model1`` is instantiated with
    ``random_state=11``, so it must accept that keyword; the other two use
    defaults. Each is trained on the module-level ``train_X_scaled`` /
    ``train_y``. The ensemble prediction for each validation row is the
    most common label among the three models.

    Prints F1, accuracy, precision and recall against ``val_y`` and plots
    predictions against the data. Returns ``None``.
    """
    m1 = model1(random_state=11)
    m1.fit(train_X_scaled, train_y)
    m2 = model2()
    m2.fit(train_X_scaled, train_y)
    m3 = model3()
    m3.fit(train_X_scaled, train_y)

    # One row of predicted labels per model.
    votes = np.vstack([m.predict(val_X_scaled) for m in (m1, m2, m3)])
    # Depending on the SciPy version, mode(...)[0] keeps the reduced axis
    # (shape (1, n)) or drops it (shape (n,)); ravel() yields (n,) in both.
    val_y_hat = np.ravel(mode(votes, axis=0)[0])

    # sklearn metrics take (y_true, y_pred). Passing them reversed swaps the
    # reported precision and recall.
    print(f'F1: {f1_score(val_y, val_y_hat)}')
    print(f'Accuracy: {accuracy_score(val_y, val_y_hat)}')
    print(f'Precision: {precision_score(val_y, val_y_hat)}')
    print(f'Recall: {recall_score(val_y, val_y_hat)}')

    fig, ax = plt.subplots(1, 1, figsize=(6, 4))

    positions = list(range(len(val_y)))
    ax.plot(positions, val_y_hat, label='Predicted Firing')
    ax.plot(positions, val_y, label='Data')
    ax.legend(loc='best')
    ax.set_title('Predictions')

In [None]:
plot_ensemble(DecisionTreeClassifier, XGBClassifier,LogisticRegression)