# Project Name

## Business understanding

## Data retrieval

In [None]:
# import useful packages
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from pandas_profiling import ProfileReport
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import r2_score, mean_absolute_error, recall_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, MinMaxScaler, StandardScaler, FunctionTransformer

In [None]:
# Load the raw dataset from a comma-separated file into a DataFrame.
# NOTE(review): `path` is not defined in any visible cell — set it to the CSV
# location before running this cell.
df = pd.read_csv(path, sep=',')

In [None]:
# Display (n_rows, n_columns): the number of observations and of candidate features.
df.shape

In [None]:
# Preview the first 5 observations (head() defaults to 5 rows).
df.head()

In [None]:
# Print each column's dtype and its count of non-null values (per column, not
# per row), which shows at a glance which columns contain missing data.
df.info()

## Data preparation

In [None]:
# Build an exploratory-data-analysis report object for the whole DataFrame.
# NOTE(review): the report is only assigned here, not rendered — presumably it is
# displayed or exported in a later step; confirm.
prof = ProfileReport(df)

In [None]:
# Share of missing values per column, worst columns first.
# The mean of the boolean isnull() mask is the NaN fraction (0.0-1.0) of each
# column; multiply by 100 for a percentage.
df.isnull().mean().sort_values(ascending=False)

In [None]:
# Normalise the column labels in a single chained pass: lowercase them, then
# trim single quotes from both ends of each label (str.strip only touches the
# ends, quotes inside a label are kept).
df.columns = df.columns.str.lower().str.strip("'")

In [None]:
# Report how many rows are exact repeats of an earlier row (the first
# occurrence is not counted as a duplicate) ...
print(df.duplicated(keep='first').sum())
# ... then remove those repeats in place, keeping each first occurrence.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Convert an object (string) column to a numeric dtype.
# errors='coerce' turns every value that cannot be parsed into NaN instead of
# raising. Bracket indexing is used rather than attribute access so the snippet
# also works for column names that contain spaces or clash with DataFrame
# attributes, and so `column` is the same name variable used by the other cells.
df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
# Convert an object (string) column to datetime64 using an explicit ISO-style
# year-month-day format.
# Bracket indexing is used rather than attribute access so the snippet also
# works for column names that contain spaces or clash with DataFrame
# attributes, and so `column` is the same name variable used by the other cells.
df[column] = pd.to_datetime(df[column], format='%Y-%m-%d')

In [None]:
# Keep only the observations that have a value in `column`.
# isna() and isnull() are aliases of each other, so one notna() mask selects
# exactly the rows that survive both original filters.
df = df[df[column].notna()]

In [None]:
# Fill the gaps in `column` with a statistic learned from the column itself.
# Available strategies: 'mean', 'median', 'most_frequent', or 'constant'
# (the latter together with fill_value=...).
# Double brackets select a one-column DataFrame, because the imputer expects 2-D input.
simpleimputer = SimpleImputer(strategy='mean').fit(df[[column]])
df[column] = simpleimputer.transform(df[[column]])

In [None]:
def cat_preprocessing(num_variables, scaler=None, data=None):
    """Scale numerical columns in place.

    Despite its name (kept so existing calls keep working) this helper scales
    *numerical* variables; it does not touch categorical ones.

    The previous implementation ran MinMaxScaler, RobustScaler and
    StandardScaler one after the other on every column. Each of them is a
    per-column shift-and-rescale, so only the last one determined the result
    (standardised values, up to floating-point rounding) and the first two
    fits were wasted work. Exactly one scaler is applied now.

    Args:
        num_variables: iterable of column names to scale.
        scaler: object exposing ``fit_transform`` (e.g. ``MinMaxScaler()``,
            ``RobustScaler()``). Defaults to a new ``StandardScaler()``, which
            matches the effective result of the previous implementation. Pass
            your own instance to keep the fitted scaler for later reuse.
        data: DataFrame to modify. Defaults to the notebook-level ``df``.

    Returns:
        None. The selected columns of the frame are overwritten.
    """
    columns = list(num_variables)
    if not columns:
        # Nothing to scale; fitting a scaler on zero columns would raise.
        return
    if scaler is None:
        scaler = StandardScaler()
    target = df if data is None else data
    # The scalers work column by column, so fitting all columns at once gives
    # the same values as fitting one scaler per column.
    target[columns] = scaler.fit_transform(target[columns])

In [None]:
# One-hot encode a categorical column.
# sparse=False makes the encoder return a dense array; handle_unknown='ignore'
# is passed so that categories not seen during fit do not raise at transform time.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# Transposing gives one row per encoded output column. Unpacking into exactly
# two targets therefore only works when the encoder produces two output columns
# (i.e. the source column has two categories).
df[new_column], df[other_new_column] = ohe.fit_transform(df[[column]]).T

In [None]:
# Build a copy of the DataFrame in which the categorical variables are replaced
# by dummy (indicator) columns; the result is stored separately and `df` itself
# is left unchanged.
df_dummies = pd.get_dummies(df)

In [None]:
# Discretise a numeric column into two labelled bins split at its mean.
# pd.cut builds right-closed intervals by default, so the lowest edge is padded
# by 1 to keep the minimum value inside the first bin (the padding on the upper
# edge is harmless). Works on `df`, the frame loaded at the top of the notebook.
df[cat_column] = pd.cut(
    x=df[num_column],
    bins=[
        df[num_column].min() - 1,
        df[num_column].mean(),
        df[num_column].max() + 1,
    ],
    labels=['cheap', 'expensive'],
)

df.head()

In [None]:
# Useful data manipulations (a collection of independent snippets).

# Sum `other_column` within each `column` group. Selecting the column before
# aggregating avoids summing every other column only to throw the result away.
df.groupby(column)[other_column].sum()
# Running total down the rows of every column.
df.cumsum(axis=0)
# Integer position of the smallest / largest value in `column`.
# These run before set_index() below, which moves `column` out of the regular
# columns so that df[column] would no longer find it.
df[column].argmin()
df[column].argmax()
# Use `column` as the row index (in place).
df.set_index(column, inplace=True)
# Sort the rows by `column`, largest first (in place); after set_index() the
# name is resolved as an index level.
df.sort_values(by=column, ascending=False, inplace=True)

In [None]:
# Plotting cheat sheet. The argument-less calls below are placeholders that
# only name the function to use; pass your data before running them.

# LINE PLOTS
# pyplot.plot takes the x and y data positionally (it has no x=/y= keywords),
# and the line width is a number rather than a string.
plt.plot(df.col1, df.col2, c='red', ls='--', lw=0.5)
sns.lineplot(data=df, x='col1', y='col2', hue='col3', size='col4')
# DISTRIBUTIONS
plt.hist()
sns.histplot()
sns.kdeplot()
sns.jointplot()
# SCATTER PLOTS
plt.scatter()
sns.scatterplot()
sns.regplot()
# COUNT PLOTS
sns.countplot()
# CAT PLOTS
plt.bar()  # eq. DataFrame.plot(kind='bar')
sns.barplot()  # eq. sns.catplot(kind="bar")
sns.violinplot()  # eq. sns.catplot(kind="violin")
sns.boxplot()  # eq. sns.catplot(kind="box")

In [None]:
# Multi-panel and 2D plotting cheat sheet. Argument-less / empty-argument calls
# are placeholders that only name the function to use.

# FACET GRID
# One histogram of 'col2' per distinct value of 'col1'.
g = sns.FacetGrid(data=df, col='col1')
g.map(plt.hist, 'col2')
# DATAFRAME-LEVEL MULTI CORRELATIONS
sns.heatmap(df.corr())
sns.pairplot(hue='')
## 2D HISTOGRAMS
plt.hist2d()
plt.colorbar()
# x and y are passed by keyword: passed positionally, the first argument can be
# taken for the dataset instead of the x variable.
sns.jointplot(x=x, y=y, kind='kde', data=df)
## 2D PROJECTION
plt.contour(X, Y, Z)  # iso lines
# The height values are the third positional argument; with only X and Y given
# positionally, X would be read as the height array.
plt.contourf(X, Y, f(X, Y))  # area colors

In [None]:
# Correlation heatmap: compute the pairwise column correlations once, then draw
# them with both axes labelled by column name on a yellow-green-blue colour scale.
corr = df.corr()
sns.heatmap(
    data=corr,
    cmap="YlGnBu",
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

In [None]:
# Permutation feature importance: how much the model's score drops when one
# feature's values are shuffled.
# Fit a class-weight-balanced logistic regression on the full feature matrix.
log_model = LogisticRegression(class_weight='balanced').fit(X, y)
# Shuffle every feature 10 times and record the score decreases.
permutation_score = permutation_importance(log_model, X, y, n_repeats=10)
# Build the table column by column so the scores keep their float dtype
# (stacking names and scores into one array stores the scores as objects).
importance_df = pd.DataFrame({
    'feature': X.columns,
    'score decrease': permutation_score.importances_mean,
})
# Most important features first.
importance_df.sort_values(by="score decrease", ascending=False)

In [None]:
# Split features and target into a training set (70 %) and a test set (30 %).
# random_state=0 fixes the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Oversample with the Synthetic Minority Over-sampling Technique (SMOTE).
# Only the training split is resampled here; X_test / y_test are left untouched.
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [None]:
# Join two DataFrames, keeping only the rows whose keys occur in both (inner
# join). With on=None pandas joins on the column names the two frames share.
pd.merge(df1, df2, how='inner', on=None)

In [None]:
# Regular expression example: collect the run of word characters (letters,
# digits, underscore) anchored at the very start of the string.
xx = "guru99,education is fun"
leading_word = re.compile(r"^\w+")
r1 = leading_word.findall(xx)
print(r1)

## Modeling

In [None]:
# 5-fold cross-validation on the training data, scored by recall.
results = cross_validate(estimator, X_train, y_train, cv=5, scoring='recall')
# Average the per-fold recall into a single figure.
results['test_score'].mean()

## Model Evaluation and Tuning

In [None]:
# Plot the confusion matrix of the model on the held-out test set.
# `estimator` is the model name used by the cross-validation and learning-curve
# cells. Those helpers work on copies of the model, so it is fitted here before
# being asked for predictions.
estimator.fit(X_train, y_train)
plot_confusion_matrix(estimator, X_test, y_test)

In [None]:
# Learning curves: recall on the training folds versus the validation folds as
# the training set grows (5-fold CV, all CPU cores).
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=estimator,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='recall',
    n_jobs=-1,
)

# Average across the folds (axis 1) to get one score per training-set size.
train_scores_mean = train_scores.mean(axis=1)
validation_scores_mean = validation_scores.mean(axis=1)

# Decorate the axes, then draw both curves.
plt.title('Learning curves', fontsize=18, y=1.03)
plt.xlabel('Training set size', fontsize=14)
plt.ylabel('recall', fontsize=14)
plt.plot(train_sizes, train_scores_mean, label='Training score')
plt.plot(train_sizes, validation_scores_mean, label='Validation score')
plt.legend()

In [None]:
# Evaluate the model on the held-out test set.
estimator.fit(X_train, y_train)
# Predict with the estimator that was just fitted; `lr` is not defined in any
# visible cell.
y_pred = estimator.predict(X_test)
# Recall: the share of actual positives that the model found.
recall_score(y_test, y_pred)

In [None]:
# K-means clustering with 3 clusters; fit() returns the estimator itself, so
# construction and fitting are chained.
km = KMeans(n_clusters=3).fit(X_train)
# Coordinates of the learned cluster centres.
km.cluster_centers_
# Cluster index assigned to each training observation.
km.labels_
# Assign new observations to the nearest learned centre.
km.predict(new_X)

In [None]:
# Logistic regression with statsmodels, evaluated on the test set.
from sklearn.metrics import accuracy_score, confusion_matrix

# NOTE(review): sm.Logit does not add an intercept term by itself — presumably
# X_train already contains a constant column; confirm.
log_reg = sm.Logit(y_train, X_train).fit(maxiter=100)
# summary() only builds the report; it must be printed to be shown because it
# is not the last expression of the cell.
print(log_reg.summary())

# Round the model's predictions for the test set to 0/1 class labels.
yhat = log_reg.predict(X_test)
prediction = list(map(round, yhat))

# confusion matrix
cm = confusion_matrix(y_test, prediction)
print("Confusion Matrix : \n", cm)

# accuracy score of the model
print('Test accuracy = ', accuracy_score(y_test, prediction))

## Conclusion