In [None]:
# Unsupervised Learning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.pipeline import Pipeline
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage
import kaleido
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import AgglomerativeClustering
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [None]:
# Load the country-level data used throughout the clustering section
df_UL = pd.read_csv('Country-data_Unsupervised.csv')
df_UL.head()

In [None]:
# Data description: column names, dtypes and non-null counts
df_UL.info()

In [None]:
# Summary statistics of the country data
df_UL.describe()

In [None]:
# Lower-triangle correlation heatmap of the country indicators.
# `country` holds strings, so the numeric columns are selected explicitly:
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df_UL.select_dtypes(include='number').corr()
# Fill diagonal and upper half with NaNs so every pair is shown only once
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(color='#f1f1f1')  # Color NaNs grey
 .format(precision=2))

In [None]:
# Split the columns by type: `country` is the only categorical feature,
# every other column is treated as numerical.
cat_feats = ['country']
cols = df_UL.columns.tolist()
cols.remove(cat_feats[0])
num_feats = list(cols)

In [None]:
# Plot the distribution of every numerical feature on a 3-column grid.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement. The number of rows follows the number of
# features instead of being fixed at 3.
n_cols = 3
n_rows = max(1, -(-len(num_feats) // n_cols))  # ceiling division
fig, ax = plt.subplots(nrows = n_rows, ncols = n_cols,
                       figsize = (15, 5 * n_rows), squeeze = False)
panels = ax.ravel()
for panel, feat in zip(panels, num_feats):
    sns.histplot(df_UL[feat], kde = True, stat = 'density', color = 'red', ax = panel)
    panel.set_title('Distribution : ' + feat)
# Hide the panels that have no feature to show
for panel in panels[len(num_feats):]:
    panel.set_visible(False)
plt.show();

In [None]:
# Build three composite indicators: every raw column is divided by its own
# mean, then the rescaled columns of a group are added up in the listed order.
feature_groups = {
    'Health': ['child_mort', 'health', 'life_expec', 'total_fer'],
    'Trade': ['imports', 'exports'],
    'Finance': ['income', 'inflation', 'gdpp'],
}
df_scal = pd.DataFrame()
for group_name, members in feature_groups.items():
    first, *rest = members
    combined = df_UL[first] / df_UL[first].mean()
    for member in rest:
        combined = combined + (df_UL[member] / df_UL[member].mean())
    df_scal[group_name] = combined
df_scal.head()

In [None]:
# MinMax Scaling of the three composite indicators to the [0, 1] range

mms = MinMaxScaler() # Normalization
ss = StandardScaler() # Standardization (used by the PCA preparation cell)
for indicator in ['Health', 'Trade', 'Finance']:
    # fit_transform returns an (n, 1) array; flatten it for the column assignment
    df_scal[indicator] = mms.fit_transform(df_scal[[indicator]]).ravel()
# Guarded so that re-running the cell does not fail on an existing column
if 'Country' not in df_scal.columns:
    df_scal.insert(loc = 0, value = list(df_UL['country']), column = 'Country')
df_scal.head()

In [None]:
# PCA input: `health` is standardised, every other numeric column is
# min-max normalised, and the `country` label is removed.
df_PCA = df_UL.copy(deep = True)

col = df_UL.columns.tolist()
for excluded in ('health', 'country'):
    col.remove(excluded)

df_PCA['health'] = ss.fit_transform(df_PCA[['health']]) # Standardization

for feature in col:
    df_PCA[feature] = mms.fit_transform(df_PCA[[feature]]) # Normalization
df_PCA = df_PCA.drop(columns = 'country')
df_PCA.head()

In [None]:
# Fit a PCA with all components kept and store the projected rows

pca = PCA()
projected = pca.fit_transform(df_PCA)
df_PCA_2 = pd.DataFrame(projected)
pca.explained_variance_

In [None]:
# Cumulative share of variance explained by the first k principal components.
# Both curves share one x axis (1..n_components) taken from the fitted PCA, so
# they line up and the cell keeps working if the number of features changes.
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_comp = np.arange(1, len(cum_var) + 1)
plt.step(n_comp, cum_var)
plt.plot(n_comp, cum_var)
plt.xlabel('Eigen Values')
plt.ylabel('Ratio of Variance Explained')
plt.title('Variance Covered by each Eigen Value')
plt.show()

In [None]:
# Feature matrices for clustering: m1 from the composite indicators,
# m2 from the PCA projection
m1 = df_scal.drop(columns = 'Country').to_numpy()
m2 = df_PCA_2.to_numpy()

In [None]:

# Choose the number of K-Means clusters on the PCA-transformed data (m2).
sse = {};sil = [];kmax = 10
fig, (ax_elbow, ax_sil) = plt.subplots(nrows = 1, ncols = 2, figsize = (20,5))

# Fit one seeded model per k, so both curves are reproducible, describe the
# same models and cover the same range up to kmax.
for k in range(1, kmax + 1):
    kmeans = KMeans(n_clusters = k, max_iter = 1000, n_init = 10, random_state = 154).fit(m2)
    sse[k] = kmeans.inertia_ # Inertia: sum of squared distances of samples to their closest cluster center
    if k >= 2: # the silhouette score needs at least two clusters
        labels = kmeans.labels_
        sil.append(silhouette_score(m2, labels, metric = 'euclidean'))

# Elbow Method :
sns.lineplot(x = list(sse.keys()), y = list(sse.values()), ax = ax_elbow);
ax_elbow.set_title('Elbow Method')
ax_elbow.set_xlabel("k : Number of cluster")
ax_elbow.set_ylabel("Sum of Squared Error")
ax_elbow.grid()

# Silhouette Score Method
sns.lineplot(x = list(range(2, kmax + 1)), y = sil, ax = ax_sil);
ax_sil.set_title('Silhouette Score Method')
ax_sil.set_xlabel("k : Number of cluster")
ax_sil.set_ylabel("Silhouette Score")
ax_sil.grid()

plt.show()


In [None]:



# Final K-Means model with 3 clusters on the PCA-transformed data.
# A fixed seed and an explicit n_init keep the cluster assignment (and the
# integer ids that later cells read from `Class`) stable across runs.
model = KMeans(n_clusters = 3, max_iter = 1000, n_init = 10, random_state = 154)
model.fit(m2)
cluster = model.cluster_centers_
centroids = np.array(cluster)
labels = model.labels_
df_UL['Class'] = labels; df_PCA_2['Class'] = labels

In [None]:

# World map of the K-Means clusters.
# Guarded so that re-running the cell does not fail on an existing column.
if 'Country' not in df_PCA_2.columns:
    df_PCA_2.insert(0,column = 'Country', value = df_UL['country'])

# K-Means cluster ids are arbitrary, so the names are derived from the data
# instead of being hard-coded: clusters are ranked by mean income, the poorest
# one becomes 'Help Needed' and the richest one 'No Help Needed'.
# NOTE(review): income is used as the ranking criterion -- confirm it matches
# the intended reading of the clusters.
income_rank = df_UL.groupby('Class')['income'].mean().sort_values().index
class_names = dict(zip(income_rank, ['Help Needed', 'Might Need Help', 'No Help Needed']))
# Map from the integer ids kept in df_UL: one assignment, no chained indexing,
# and the cell can be re-run safely.
df_PCA_2['Class'] = df_UL['Class'].map(class_names)

fig = px.choropleth(df_PCA_2[['Country','Class']],
                    locationmode = 'country names',
                    locations = 'Country',
                    title = 'Needed Help Per Country (World)',
                    color = 'Class',
                    color_discrete_map = {'Help Needed':'#d62728',
                                          'Might Need Help':'#bcbd22',
                                          'No Help Needed': '#1f77b4'})
fig.update_geos(fitbounds = "locations", visible = True)
fig.update_layout(legend_title_text = 'Labels',legend_title_side = 'top',title_pad_l = 260,title_y = 0.86)
fig.show(engine = 'kaleido')

In [None]:
# hierarchical clustering
# Ward linkage on Euclidean distances over the PCA-transformed rows,
# shown as a dendrogram
linkage_data = linkage(m2, 'ward', 'euclidean')
dendrogram(linkage_data)
plt.tight_layout(); plt.show()

In [None]:

# Agglomerative (Ward) clustering into 3 groups on the PCA-transformed data.
# Ward linkage only supports Euclidean distance, which is also the default, so
# the distance keyword is omitted: it was renamed from `affinity` to `metric`
# and the old name is rejected by current scikit-learn.
hierarchical_cluster = AgglomerativeClustering(n_clusters = 3, linkage = 'ward')
labels = hierarchical_cluster.fit_predict(m2)  # integer cluster id per row

pred_agc = pd.Series(labels)
df_UL['Class'] = pred_agc; df_PCA_2['Class'] = pred_agc

In [None]:
# World map of the agglomerative clusters.
# Name the clusters with a single .map() from the integer ids kept in df_UL.
# The previous `df['Class'].loc[...] = value` chained assignment writes to a
# temporary object under pandas copy-on-write and then has no effect.
class_names = {0: 'Help Needed', 1: 'Might Need Help', 2: 'No Help Needed'}
df_PCA_2['Class'] = df_UL['Class'].map(class_names)

fig = px.choropleth(df_PCA_2[['Country','Class']],
                    locationmode = 'country names',
                    locations = 'Country',
                    title = 'Needed Help Per Country (World)',
                    color = 'Class',
                    color_discrete_map={'Help Needed':'Red',
                                        'Might Need Help':'Yellow',
                                        'No Help Needed':'Green'})
fig.update_geos(fitbounds = "locations", visible = True)
fig.update_layout(legend_title_text = 'Labels',legend_title_side = 'top',title_pad_l = 260,title_y = 0.86)
fig.show(engine = 'kaleido')

In [None]:
# regression

In [None]:
# Load the auto-mpg data; "?" entries are read as missing values
df_reg = pd.read_csv('auto-mpg-regression.csv', na_values = "?")
df_reg.head()

In [None]:
# Summary statistics of the auto-mpg data
df_reg.describe()

In [None]:
def auto_preprocess(dataframe):
    """Split the ``car name`` column into ``make`` and ``name``.

    The first word of ``car name`` is taken as the make; known misspellings
    and abbreviations are replaced by the canonical brand. The remaining words
    form the model name. Both are title-cased. The input frame is not
    modified; a new frame without ``car name`` is returned.
    """
    corrections = {'chevroelt': 'chevrolet',
                   'chevy': 'chevrolet',
                   'vokswagen': 'volkswagen',
                   'vw': 'volkswagen',
                   'hi': 'harvester',
                   'maxda': 'mazda',
                   'toyouta': 'toyota',
                   'mercedes-benz': 'mercedes'}
    result = dataframe.copy()

    makes = []
    for full_name in result['car name']:
        brand = full_name.split()[0]
        makes.append(corrections.get(brand, brand).title())
    result['make'] = makes

    models = []
    for full_name in result['car name']:
        models.append(' '.join(full_name.split()[1:]).title())
    result['name'] = models

    return result.drop(columns = ['car name'])

# Replace `car name` with the separate `make` and `name` columns
df_reg = auto_preprocess(df_reg)
df_reg.head()

In [None]:
# types of variables

def check_class(dataframe):
    """Count the distinct values of every column.

    Returns a frame with one row per column of ``dataframe`` ('Variable') and
    its number of unique values ('Classes'), sorted from most to fewest
    classes and re-indexed from 0.
    """
    counts = [(name, dataframe[name].nunique()) for name in dataframe.columns]
    nunique_df = pd.DataFrame(counts, columns = ['Variable', 'Classes'])
    return nunique_df.sort_values('Classes', ascending=False).reset_index(drop = True)

# Distinct-value count for every column of the auto-mpg frame
check_class(df_reg)

In [None]:
# Lower-triangle correlation heatmap of the auto-mpg data.
# `make` and `name` hold strings, so the numeric columns are selected
# explicitly: DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df_reg.select_dtypes(include='number').corr()
# Fill diagonal and upper half with NaNs so every pair is shown only once
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr[mask] = np.nan
(corr
 .style
 .background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
 .highlight_null(color='#f1f1f1')  # Color NaNs grey
 .format(precision=2))

In [None]:
# Impute missing horsepower with the median horsepower of the cars that share
# the same cylinder count and origin
cat_cols = ['cylinders', 'origin']
group_median_hp = df_reg.groupby(cat_cols)['horsepower'].transform('median')
df_reg['horsepower'] = df_reg['horsepower'].fillna(group_median_hp)

In [None]:
# Probability plot of the raw mpg target, drawn before it is transformed

fig, ax = plt.subplots(figsize = (8,6))
stats.probplot(df_reg["mpg"], plot = ax)
ax.set_title("Before Log1p Transformation", size = 12)
plt.show()

In [None]:
# log1p-transform the target. Not idempotent: running this cell twice applies
# the transform twice.
df_reg["mpg"] = np.log1p(df_reg["mpg"])

In [None]:
# encoding

def one_hot_encoder(dataframe, categorical_cols: list, drop_first: bool = False):
    """Return a new frame in which ``categorical_cols`` are one-hot encoded.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded to it
    and drops the first level of every encoded column.
    """
    encoded = pd.get_dummies(dataframe, columns = categorical_cols, drop_first = drop_first)
    return encoded

def label_encoder(dataframe, binary_col):
    """Integer-encode ``binary_col`` with a fresh LabelEncoder.

    The column is overwritten in the frame that was passed in, and that same
    frame is returned.
    """
    encoded_values = LabelEncoder().fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe

# Binary features: exactly two distinct values and a dtype that is neither
# int nor float
binary_cols = []
for column in df_reg.columns:
    if df_reg[column].dtype not in [int, float] and df_reg[column].nunique() == 2:
        binary_cols.append(column)
print(f'Binary Features: {binary_cols}')

# Multiclass features: 3 to 10 distinct values, plus `make`, which is always
# added to the list
ohe_cols = [column for column in df_reg.columns if 2 < df_reg[column].nunique() <= 10]
ohe_cols.append('make')
print(f'Multiclass Features: {ohe_cols}')

In [None]:
# Cast the two coded columns to int, then one-hot encode the multiclass features
df_reg = df_reg.astype({'cylinders': int, 'origin': int})
df_reg = one_hot_encoder(df_reg, ohe_cols)
df_reg.head()

In [None]:
# Drop near-constant binary columns: exactly two distinct values, one of
# which occurs in fewer than 3% of the rows.
useless_cols = [col for col in df_reg.columns if df_reg[col].nunique() == 2 and
                (df_reg[col].value_counts() / len(df_reg) < 0.03).any()]

print('Number of useless variables: {}'.format(len(useless_cols)))
df_reg = df_reg.drop(columns = useless_cols)

In [None]:
# split into target and features; `name` is left out of the features
y = df_reg['mpg']

X = df_reg.drop(columns = ['mpg', 'name'])

X.head()

In [None]:

# Hold out 20% of the rows for evaluation, with a fixed seed
test_size = 0.2
random_state = 154

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = test_size, random_state = random_state)

In [None]:


# train
def train_pipeline(pipe):
    """Fit a (scaler, regressor) pipeline and score it on the held-out split.

    Reads the module-level X_train / X_test / y_train / y_test. The target was
    log1p-transformed earlier in the notebook, so predictions and true values
    are mapped back with expm1 before the metrics are computed.

    Parameters
    ----------
    pipe : Pipeline
        Two steps: a scaler (or None for no scaling) followed by a regressor.

    Returns
    -------
    dict
        'model' and 'scaler' (class names, or 'Without Scaling') plus the
        scalar metrics 'r2' and 'mse'.
    """
    result = dict()
    scaler = pipe.steps[0][1].__class__.__name__
    regressor = pipe.steps[1][1].__class__.__name__
    result['model'] = regressor
    result['scaler'] = scaler if scaler != 'NoneType' else 'Without Scaling'

    #Training Model
    pipe.fit(X_train, y_train)

    #Get Predictions
    y_pred = pipe.predict(X_test)
    y_test_exp = np.expm1(y_test)
    y_pred_exp = np.expm1(y_pred)

    #Model Evaluation (plain floats: a stray trailing comma used to wrap r2 in a tuple)
    result['r2'] = r2_score(y_test_exp, y_pred_exp)
    result['mse'] = mean_squared_error(y_test_exp, y_pred_exp)
    return result

scalers = [None, RobustScaler(), MinMaxScaler(), StandardScaler()]

# The tree-based models are seeded so the comparison table is reproducible
regressors = [LinearRegression(),
              Lasso(), Ridge(),
              RandomForestRegressor(random_state = random_state),
              DecisionTreeRegressor(random_state = random_state),
              GradientBoostingRegressor(random_state = random_state)]

# DataFrame.append was removed in pandas 2.0: collect one record per
# (regressor, scaler) pair and build the table once.
eval_records = []
for reg in regressors:
    for sc in scalers:
        pipeline = Pipeline([('scaler', sc), ('reg', reg)])
        eval_records.append(train_pipeline(pipeline))
eval_data = pd.DataFrame(eval_records)
eval_data

In [None]:
# Same table transposed: one column per (model, scaler) run
eval_data.transpose()

In [None]:
# Supervised task: load the heart dataset
df_SL = pd.read_csv('heart supervised.csv')

In [None]:
# Summary statistics of the heart data
df_SL.describe()

In [None]:
# Column roles for the heart data. `cat_cols` reuses the name from the
# regression section; both lists are defined again in the preprocessing cell,
# where cat_cols is one-hot encoded and con_cols is scaled.
cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]
target_col = ["output"]

In [None]:
# Number of missing values per column
df_SL.isnull().sum()

In [None]:
# Correlation heatmap of the heart data, lower triangle only
corr = df_SL.corr()
# True on and above the diagonal: those cells are blanked out with NaN
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr = corr.mask(mask)
styled = corr.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)
styled = styled.highlight_null(color='#f1f1f1')  # Color NaNs grey
styled.format(precision=2)

In [None]:
# creating an actual copy of df, so the raw frame stays untouched
df1 = df_SL.copy()

# define the columns to be encoded and scaled
cat_cols = ['sex','exng','caa','cp','fbs','restecg','slp','thall']
con_cols = ["age","trtbps","chol","thalachh","oldpeak"]

# encoding the categorical columns
df1 = pd.get_dummies(df1, columns = cat_cols, drop_first = True)

# defining the features and target
X = df1.drop(columns = ['output'])
# 1-D target: scikit-learn estimators expect y of shape (n_samples,) and warn
# when they are handed a single-column DataFrame
y = df1['output']

# instantiating the scaler
scaler = RobustScaler()

# scaling the continuous features
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features; consider fitting it on the
# training split only.
X[con_cols] = scaler.fit_transform(X[con_cols])
print("The first 5 rows of X are")
X.head()

In [None]:
# 80/20 train/test split of the heart data with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 154)

In [None]:
# Collects one F1 score per model name; filled in by the cells below
models_supervised = {}

In [None]:

# Linear-kernel SVM baseline
clf = SVC(kernel='linear', C=1, random_state=154)
clf.fit(X_train,y_train)

# F1 score on the held-out split
y_pred = clf.predict(X_test)
models_supervised['SVM'] = f1_score(y_test, y_pred)

In [None]:
# SVM tuned with a small grid search over C and gamma
svm = SVC()

gamma_grid = [0.00001,0.00005, 0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,5]
parameters = {"C": np.arange(1,10), 'gamma': gamma_grid}

# GridSearchCV with its default cross-validation and scoring settings
searcher = GridSearchCV(svm, parameters)
searcher.fit(X_train, y_train)

# the fitted search predicts with its best estimator
y_pred = searcher.predict(X_test)
models_supervised['SVM (tuned)'] = f1_score(y_test, y_pred)

In [None]:
# Logistic regression with default settings
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# class probabilities, one column per class
y_pred_proba = logreg.predict_proba(X_test)

# predicted class = column index with the highest probability
y_pred = y_pred_proba.argmax(axis=1)

models_supervised['Logistic Regression'] = f1_score(y_test, y_pred)

In [None]:
# probability of the positive class (second column of predict_proba)
y_pred_prob = logreg.predict_proba(X_test)[:,1]

# computing the ROC curve
fpr,tpr,threshols=roc_curve(y_test,y_pred_prob)

# plotting the curve against the chance diagonal.
# The stray 'r+' positional argument was removed: matplotlib read it as the
# data of a second series, not as a format string.
plt.plot([0,1],[0,1],"k--",label='Chance')
plt.plot(fpr,tpr,label='Logistic Regression')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Logistic Regression ROC Curve")
plt.legend()  # the labels above are only shown once a legend is drawn
plt.show()

In [None]:
# Decision tree with a fixed seed, fitted on the training split
dt = DecisionTreeClassifier(random_state = 154).fit(X_train, y_train)

# F1 score on the held-out split
y_pred = dt.predict(X_test)
models_supervised['Decision Tree'] = f1_score(y_test, y_pred)

In [None]:
# instantiating the object (seeded so the score is reproducible)
rf = RandomForestClassifier(random_state = 154)

# fitting the model
rf.fit(X_train, y_train)

# calculating the predictions with the random forest itself; the decision
# tree `dt` was used here before, so the tree's score was reported twice
y_pred = rf.predict(X_test)


models_supervised['Random Forest'] = f1_score(y_test, y_pred)

In [None]:
# Gradient boosting over depth-1 trees with row and feature subsampling
gbt_params = {'n_estimators': 300,
              'max_depth': 1,
              'subsample': 0.8,
              'max_features': 0.2,
              'random_state': 154}
gbt = GradientBoostingClassifier(**gbt_params)
gbt.fit(X_train,y_train)

# F1 score on the held-out split
y_pred = gbt.predict(X_test)
models_supervised['Gradient Boosting'] = f1_score(y_test, y_pred)

In [None]:
# One row per model with its F1 score
df_sup_mods = pd.DataFrame([models_supervised], index=['F1'])
df_sup_mods.T