#  Label encoding on 1 variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct Attrition labels, then overwrite the column with their integer codes.
enc = LabelEncoder()
enc.fit(data['Attrition'])
data['Attrition'] = enc.transform(data['Attrition'])

# One Hot Encoding on 1 variable

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode JobRole and replace the original column with the dummy columns.
ohe = OneHotEncoder()
X = ohe.fit_transform(data['JobRole'].values.reshape(-1, 1)).toarray()
# Name the columns from ohe.categories_, which is the order the encoder actually emits;
# Series.unique() returns first-appearance order and would mislabel the columns.
# Reusing data.index keeps the concat below aligned row by row.
onehotdf = pd.DataFrame(X, columns=list(ohe.categories_[0]), index=data.index)
# Drop the first dummy column; the remaining columns still identify every category.
onehotdf = onehotdf.drop(columns=onehotdf.columns[0])
data = pd.concat([data, onehotdf], axis=1)
data = data.drop(['JobRole'], axis=1)

# Label Encoding on Multiple variables

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance is refit on each column in turn, so after the loop it only
# remembers the classes of the last column processed.
enc = LabelEncoder()
columns_to_encode = new_data.columns
for col in columns_to_encode:
    encoded_values = enc.fit_transform(data[col])
    data[col] = encoded_values

# OneHotEncoding on multiple features

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode every column listed in new_data1, replacing each with its dummy columns.
ohe = OneHotEncoder()
for col in new_data1.columns:
    X = ohe.fit_transform(data[col].values.reshape(-1, 1)).toarray()
    # Name the columns from ohe.categories_, which is the order the encoder actually emits;
    # Series.unique() returns first-appearance order and would mislabel the columns.
    # Reusing data.index keeps the concat below aligned row by row.
    # NOTE(review): dummy columns are named by bare category value, so two features
    # sharing a value (e.g. 'Yes') produce duplicate column names - confirm this is acceptable.
    onehotdf = pd.DataFrame(X, columns=list(ohe.categories_[0]), index=data.index)
    # Drop the first dummy column; the remaining columns still identify every category.
    onehotdf = onehotdf.drop(columns=onehotdf.columns[0])
    data = pd.concat([data, onehotdf], axis=1)
    data = data.drop([col], axis=1)

# Train and Test Accuracy for classification problem

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for one data split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : features and labels of the training split.
    X_test, y_test : features and labels of the test split.
    train : bool, default True
        Truthy reports on the training split, falsy reports on the test split.

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Pick the split once so the reporting code is not duplicated per branch.
    if train:
        split_name, features, target = "Train", X_train, y_train
    else:
        split_name, features, target = "Test", X_test, y_test

    pred = clf.predict(features)
    clf_report = pd.DataFrame(classification_report(target, pred, output_dict=True))
    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(target, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(target, pred)}\n")

# Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score, make_scorer
# Search space for the SVM. The linear kernel ignores gamma, so each kernel gets its
# own sub-grid; a single combined dict would refit every linear model once per gamma value.
C_values = [0.01, 0.1, 0.5, 1, 10, 100, 1000]
param_grid = [
    {'kernel': ['rbf'],
     'C': C_values,
     'gamma': [1, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001, .0001, .00001]},
    {'kernel': ['linear'],
     'C': C_values},
]

# 5-fold cross-validated search scored by F1, using all available cores.
grid = GridSearchCV(SVC(), param_grid, scoring=make_scorer(f1_score), refit=True, verbose=1, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print(f"Best params: {best_params}")

# refit=True already retrained the best configuration on the full training set,
# so reuse that estimator instead of fitting a second, identical model.
svm_clf = grid.best_estimator_
print_score(svm_clf, X_train, y_train, X_test, y_test, train=True)
print_score(svm_clf, X_train, y_train, X_test, y_test, train=False)

# Plotting boxplot for all the columns and checking outliers

In [None]:
# One boxplot per column, skipping the last column of the frame
# (presumably the target - confirm against the dataset).
n_plots = len(data.columns) - 1
n_cols = 3
# Keep the original 4x3 layout, and add rows only when more than 12 columns must be
# plotted (a fixed 4x3 grid would run out of axes and raise IndexError).
n_rows = max(4, -(-n_plots // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 15 * n_rows / 4))
axes = axes.flatten()

for i in range(n_plots):
    sb.boxplot(y=data.iloc[:, i], data=data, orient='v', ax=axes[i])

plt.tight_layout()
plt.show()

# Convert Target variable into number

In [None]:
# Overwrite the textual Status labels with numeric codes, one label at a time.
for status_label, status_code in (('Abnormal', 1), ('Normal', 0)):
    data.loc[data.Status == status_label, 'Status'] = status_code

# Visualizing the ROC Curve

In [None]:
from sklearn import metrics
# Area under the curve and the curve's points are both computed from y_test and y_pred.
# NOTE(review): if y_pred holds hard class labels rather than scores, the curve has
# only a single interior point - confirm what y_pred contains.
auc = metrics.roc_auc_score(y_test, y_pred)
fpr, tpr, threshholds = metrics.roc_curve(y_test, y_pred)

roc_label = "ROC curve , auc=" + str(auc)
plt.plot(fpr, tpr, label=roc_label)
plt.legend(loc=4)
plt.show()

# Creating Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression (with intercept) on the training split.
regressor = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# One coefficient per feature, labelled with the column names of X.
coeff_df = pd.DataFrame(data=regressor.coef_, index=X.columns, columns=['Coefficient'])
coeff_df

# Fitted intercept term.
regressor.intercept_

# Validating the OLS assumptions

In [None]:
# Residuals against fitted values: the points should scatter around zero.
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")

# Mean residual; expected to lie close to 0.
residuals.mean()

# Evaluating Regression Model

In [None]:
from sklearn import metrics
# Compute each error metric once, then report them; RMSE is the square root of the MSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# StatsModel

In [None]:
import statsmodels.api as sm
# Add an intercept column to both design matrices.
# NOTE(review): despite the "endog" names these are the regressors passed as the
# second argument of sm.OLS; the response is y_train.
X_endog = sm.add_constant(X_train)
X_endog1 = sm.add_constant(X_test)
res = sm.OLS(y_train, X_endog)
# Fit once and keep the results object (the model was previously fitted twice,
# with the first result thrown away).
ols_results = res.fit()
ols_results.summary()

# Z-Score method for Outliers

In [None]:
# Three-sigma limits for Paved_Highways.
highway_mean = dataset.Paved_Highways.mean()
highway_std = dataset.Paved_Highways.std()
upper_limit = highway_mean + 3 * highway_std
lower_limit = highway_mean - 3 * highway_std

#checking the outliers
dataset[(dataset['Paved_Highways'] > upper_limit) | (dataset['Paved_Highways'] < lower_limit)]

#Removing the outliers
# Filter the same frame the limits were computed from, and keep rows lying exactly on
# a limit: the check above does not count them as outliers, so they must not be dropped.
new_data = dataset[(dataset['Paved_Highways'] <= upper_limit) & (dataset['Paved_Highways'] >= lower_limit)]

# Checking the interaction effect

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 interaction-only expansion of x: the original features followed by every
# pairwise product, without a bias column.
# assumes x has exactly the columns TV, Radio, Newspaper in that order - TODO confirm
x_interaction = PolynomialFeatures(2, interaction_only=True, include_bias=False).fit_transform(x)
# Labels use the same spelling as the columns added to `data` below ('TV:Newspaper').
interaction_df = pd.DataFrame(
    x_interaction,
    columns=['TV', 'Radio', 'Newspaper', 'TV:Radio', 'TV:Newspaper', 'Radio:Newspaper'],
)
from statsmodels.regression import linear_model
# NOTE(review): no constant column is added, so this OLS is fitted without an intercept.
interaction_model = linear_model.OLS(y, interaction_df).fit()
# Terms whose p-value is below 0.05.
interaction_model.pvalues[interaction_model.pvalues < 0.05]


# Add the pairwise interaction terms to the working frame.
data['TV:Radio'] = data['TV'] * data['Radio']
data['TV:Newspaper'] = data['TV'] * data['Newspaper']
data['Radio:Newspaper'] = data['Radio'] * data['Newspaper']

# Visualizing outliers using boxplot

In [None]:
# Box plots for creditcard_df, drawn on one shared axes.
fig, ax = plt.subplots(figsize=(15, 10))
boxplot_options = {'width': 0.5, 'fliersize': 3}
sns.boxplot(data=creditcard_df, ax=ax, **boxplot_options)

# Elbow method and silhouette method for selecting the number of clusters

In [None]:
figure = plt.figure(figsize=(16, 9))
elbow = figure.add_subplot(1, 2, 1)  # elbow chart
kmean_sil = figure.add_subplot(1, 2, 2)  # silhouette bar chart

# --- Elbow chart: inertia (sum of squared errors) for k = 1 .. n_clusters - 1 ---
n_clusters = 19
elbow_k = list(range(1, n_clusters))
cost = []
for i in elbow_k:
    kmean = KMeans(i)
    kmean.fit(creditcard_df_scaled)
    cost.append(kmean.inertia_)

# Draw the curve once, against the real cluster counts. Plotting inside the loop
# stacked one partial line per iteration and used 0-based positions on the x-axis.
elbow.plot(elbow_k, cost, 'bx-')
elbow.set_ylabel('Sum of Squared Errors', fontsize = 15)
elbow.set_xlabel('Number of Clusters', fontsize = 15)
elbow.set_title('K-MEANS Clustering SSE: Elbow Chart', fontsize = 15)

# --- Silhouette chart: score for k = 2 .. 19 (the score needs at least two clusters) ---
k = list(range(2, 20))
silhouette_scores = []
for n_cluster in k:
    cluster_labels = KMeans(n_clusters = n_cluster).fit_predict(creditcard_df_scaled)
    silhouette_scores.append(silhouette_score(creditcard_df_scaled, cluster_labels))

# Plotting a bar graph to compare the results
kmean_sil.bar(k, silhouette_scores)
kmean_sil.set_title('K-MEANS: Number of Clusters vs. Silhouette Score', fontsize = 15)
kmean_sil.set_xlabel('Number of Clusters', fontsize = 15)
kmean_sil.set_ylabel('Silhouette Score', fontsize = 15)

# Concatenating the cluster to each data point

In [None]:
# Build the KMeans model
kmeans = KMeans(5, random_state=6)
kmeans.fit(creditcard_df_scaled)
labels = kmeans.labels_

# Give the label column the same index as creditcard_df so concat pairs rows
# positionally as intended; a fresh RangeIndex would misalign (and introduce NaNs)
# whenever creditcard_df does not carry a default 0..n-1 index.
cluster_col = pd.DataFrame({'cluster': labels}, index=creditcard_df.index)
creditcard_df_cluster = pd.concat([creditcard_df, cluster_col], axis = 1)
creditcard_df_cluster.head()

# No of values in each cluster

In [None]:
# Frequency table: one row per cluster label, with a single 'count' column.
cluster_column = creditcard_df_cluster['cluster']
pd.crosstab(index=cluster_column, columns='count')

# Handling the imbalance data

#### Upsampling

In [None]:
from sklearn.utils import resample

# Split the training frame by Attrition value: 0 is treated as the majority class,
# 1 as the minority class.
attrition = df_train.Attrition
df_majority = df_train[attrition == 0]
df_minority = df_train[attrition == 1]

# Report the size of each class, then the overall distribution.
separator = "-----------"
print(df_majority.Attrition.count())
print(separator)
print(df_minority.Attrition.count())
print(separator)
print(df_train.Attrition.value_counts())

In [None]:
# Draw minority rows with replacement until the class is as large as the majority class.
# The target size is taken from df_majority instead of a hard-coded count, so the
# classes stay balanced whatever the dataset size.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # to match majority class
                                 random_state=10)             # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# Display new class counts
df_upsampled.Attrition.value_counts()

#### Downsampling

In [None]:
# Draw majority rows without replacement until the class is as small as the minority class.
# The target size is taken from df_minority instead of a hard-coded count, so the
# classes stay balanced whatever the dataset size.
df_majority_downsampled = resample(df_majority,
                                   replace=False,               # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=24)             # reproducible results
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
# Display new class counts
df_downsampled.Attrition.value_counts()

#### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (fixed seed for reproducibility).
# NOTE(review): `sm` is also the alias used for statsmodels.api in the StatsModel
# cell of this file; running this cell rebinds it.
sm = SMOTE(random_state=587)
X_SMOTE, y_SMOTE = sm.fit_resample(X_train, y_train)

# Total rows, sum of the labels, and per-class counts after resampling.
for summary in (len(y_SMOTE), y_SMOTE.sum(), y_SMOTE.value_counts()):
    print(summary)

# Feature importance using ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extra-trees ensemble; its feature_importances_ give one score per input feature.
clf = ExtraTreesClassifier().fit(x, y)

# Pair feature names with scores by position.
# NOTE(review): names come from data.columns while the model was fitted on x -
# confirm both list the same features in the same order.
features = pd.DataFrame(data.columns, columns=['features'])
scores = pd.DataFrame(clf.feature_importances_, columns=['scores'])
sc = pd.concat([features, scores], axis=1)

# Print the names of the 15 highest-scoring features, best first.
top_rows = sc['scores'].sort_values(ascending=False).head(15).index
for i in top_rows:
    print(sc.loc[i, 'features'])

# Method for finding the outliers

In [None]:

def detect_outliers(df,features):
    """Return the index labels of rows that are IQR outliers in more than one feature.

    For each column in ``features`` a value is an outlier when it lies more than
    1.5 * IQR below the first quartile or above the third quartile.

    Parameters
    ----------
    df : pandas.DataFrame holding the columns named in ``features``.
    features : iterable of column names (numeric columns) to inspect.

    Returns
    -------
    list
        Index labels flagged in at least two of the inspected columns.
    """
    outlier_indices = []

    for c in features:
        # nanpercentile skips missing values. np.percentile returns NaN as soon as the
        # column contains one NaN, which makes both comparisons below False for every
        # row and silently disables outlier detection for that column.
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        # Outlier step: 1.5 times the inter-quartile range
        outlier_step = (Q3 - Q1) * 1.5
        # Rows outside the [Q1 - step, Q3 + step] band; NaN values never match
        is_outlier = (df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # Count in how many columns each row was flagged and keep rows flagged more than once
    hits_per_row = Counter(outlier_indices)
    return [idx for idx, hits in hits_per_row.items() if hits > 1]


# Show the rows flagged as outliers in both Age and Fare.
outlier_rows = detect_outliers(train_df, ["Age", "Fare"])
train_df.loc[outlier_rows]

# Creating multiple visuals

In [None]:
# One count plot per entry of cat_col, stacked vertically in an 8-row grid.
plt.figure(figsize=(30, 50))
for position, column in enumerate(cat_col, start=1):
    plt.subplot(8, 1, position)
    sb.countplot(x=column, data=data)

# Plotting how every feature correlates with the target

In [None]:
sns.set(font_scale=1.2)
plt.figure(figsize=(30, 30))

# One bar plot of Attrition per categorical column, laid out on a 3x3 grid.
for position, column in enumerate(categorical_col, start=1):
    column_label = f"{column}"
    plt.subplot(3, 3, position)
    g = sns.barplot(x=column_label, y='Attrition', data=df)
    # Rotate the category labels so long names do not overlap.
    g.set_xticklabels(g.get_xticklabels(), rotation=90)
    plt.ylabel('Attrition Count')
    plt.xlabel(column_label)

# Showing missing values using bar chart

In [None]:
plt.figure(figsize=(10, 10))

# Count the nulls per column and turn the result into a two-column table.
null_counts = train_data.isnull().sum()
miss_value = pd.DataFrame(null_counts).reset_index()
miss_value.columns = ['Columns', 'No_of_missing_values']

# Keep only the columns that actually have missing values, then plot them horizontally.
has_missing = miss_value['No_of_missing_values'] > 0
miss_value = miss_value[has_missing]
sb.barplot(data=miss_value, x='No_of_missing_values', y='Columns', orient='h')