# Machine Learning Model - CLustering and Logistic Regression

In [9]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [11]:
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Load data in from database

In [33]:
machine_learning_df = pd.read_csv('resources/machine_learning_df.txt')
machine_learning_df.head()

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
0,ABW,1960,0,,,,,
1,ABW,1961,0,,,,0.006275,
2,ABW,1962,0,,,,0.0056,
3,ABW,1963,0,,,,0.005162,
4,ABW,1964,0,,,,0.004881,


### Clean Data

In [34]:
df = machine_learning_df.dropna()
df

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
94,AFE,1991,0,0.052202,-0.012500,-0.013075,-0.002066,0.034620
95,AFE,1992,0,-0.015105,-0.006331,0.012312,-0.001786,-0.012690
96,AFE,1993,0,-0.030320,0.001489,0.000199,-0.001315,0.048812
97,AFE,1994,0,0.017172,-0.004847,0.010337,-0.000951,0.015282
98,AFE,1995,0,-0.025642,-0.004848,0.006704,-0.000657,0.122841
...,...,...,...,...,...,...,...,...
16750,ZWE,2015,0,-0.022332,-0.000012,-0.017700,0.019243,0.023985
16751,ZWE,2016,0,0.008873,-0.001444,0.007834,0.012766,0.029332
16752,ZWE,2017,0,0.024376,-0.001975,-0.002846,0.008591,-0.144232
16753,ZWE,2018,0,0.097249,-0.003222,0.083909,0.006298,0.030177


In [47]:
df = df.set_index(['Country_Code', 'Year'])
df2 = df.drop(["Population_Change"],axis=1)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
Country_Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AFE,1991,0.052202,-0.012500,-0.013075,-0.002066,0.034620
AFE,1992,-0.015105,-0.006331,0.012312,-0.001786,-0.012690
AFE,1993,-0.030320,0.001489,0.000199,-0.001315,0.048812
AFE,1994,0.017172,-0.004847,0.010337,-0.000951,0.015282
AFE,1995,-0.025642,-0.004848,0.006704,-0.000657,0.122841
...,...,...,...,...,...,...
ZWE,2015,-0.022332,-0.000012,-0.017700,0.019243,0.023985
ZWE,2016,0.008873,-0.001444,0.007834,0.012766,0.029332
ZWE,2017,0.024376,-0.001975,-0.002846,0.008591,-0.144232
ZWE,2018,0.097249,-0.003222,0.083909,0.006298,0.030177


### Standardize Data

In [48]:
# Standardize the data with StandardScaler().
df_scaled = StandardScaler().fit_transform(df2)
df_scaled[:5]

array([[ 3.03512135e-02, -8.31397934e-01, -3.66924825e-01,
        -1.07257835e+00, -3.18841837e-01],
       [ 1.07261533e-04, -4.12958773e-01,  2.50658736e-01,
        -1.03047538e+00, -6.55730205e-01],
       [-6.72936075e-03,  1.17427117e-01, -4.40203536e-02,
        -9.59910363e-01, -2.17788546e-01],
       [ 1.46105119e-02, -3.12303609e-01,  2.02610379e-01,
        -9.05245587e-01, -4.56543553e-01],
       [-4.62764850e-03, -3.12419311e-01,  1.14223234e-01,
        -8.61126420e-01,  3.09355403e-01]])

### Reducing Data Dimensions Using PCA

In [49]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3, random_state=0)
df_pca = pca.fit_transform(df_scaled)
df_pca

array([[-0.13770472,  0.75613406, -0.25066158],
       [-0.29202894,  1.14647817, -0.28346117],
       [-0.52643619,  0.56747441, -0.17282176],
       ...,
       [-0.60792261,  0.48829167, -0.08291465],
       [ 0.91309716,  1.27175008, -0.23226969],
       [ 0.53223452,  0.35810495, -1.172963  ]])

In [50]:
# Create a DataFrame with the three principal components.
pca_df=pd.DataFrame(
    data=df_pca, columns=['PC 1', 'PC 2', 'PC 3'], index=df2.index
)
print(pca_df.shape)
pca_df.head(n=10)

(7278, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,PC 1,PC 2,PC 3
Country_Code,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFE,1991,-0.137705,0.756134,-0.250662
AFE,1992,-0.292029,1.146478,-0.283461
AFE,1993,-0.526436,0.567474,-0.172822
AFE,1994,-0.236246,0.918968,-0.247429
AFE,1995,0.126235,0.459927,-0.135747
AFE,1996,-0.338923,0.920717,-0.222109
AFE,1997,-0.448108,0.325038,-0.127266
AFE,1998,-0.699615,0.763383,-0.183387
AFE,1999,-0.678758,0.385175,-0.113957
AFE,2000,0.16429,0.164054,-0.04167


### Clustering Using K-Means

In [51]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))

# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_df)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [70]:
# Initialize the K-Means model.
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(pca_df)

# Predict clusters
predictions = model.predict(pca_df)
len(predictions)

7278

In [71]:
# Create a new DataFrame including predicted clusters and country features.
# Concatentate the df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([df, pca_df], axis=1)

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df['Class'] = predictions

# Reset Index
clustered_df = clustered_df.reset_index()

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(7278, 12)


Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff,PC 1,PC 2,PC 3,Class
0,AFE,1991,0,0.052202,-0.0125,-0.013075,-0.002066,0.03462,-0.137705,0.756134,-0.250662,0
1,AFE,1992,0,-0.015105,-0.006331,0.012312,-0.001786,-0.01269,-0.292029,1.146478,-0.283461,0
2,AFE,1993,0,-0.03032,0.001489,0.000199,-0.001315,0.048812,-0.526436,0.567474,-0.172822,0
3,AFE,1994,0,0.017172,-0.004847,0.010337,-0.000951,0.015282,-0.236246,0.918968,-0.247429,0
4,AFE,1995,0,-0.025642,-0.004848,0.006704,-0.000657,0.122841,0.126235,0.459927,-0.135747,0
5,AFE,1996,0,-0.02582,-0.003064,0.01158,-0.000243,-0.004546,-0.338923,0.920717,-0.222109,0
6,AFE,1997,0,0.00543,-8.6e-05,-0.008111,0.000481,0.051304,-0.448108,0.325038,-0.127266,0
7,AFE,1998,0,-0.027543,5.3e-05,0.000896,0.001515,-0.058024,-0.699615,0.763383,-0.183387,0
8,AFE,1999,0,0.003246,0.004337,-0.000723,0.00288,-0.013693,-0.678758,0.385175,-0.113957,0
9,AFE,2000,0,0.007816,-0.002689,0.008337,0.004544,0.082976,0.16429,0.164054,-0.04167,0


### Visualizing Cluster Results

In [72]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="Country_Code",
    hover_data=["Year"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [98]:
# Export to Excel to review the classes
clustered_df.to_excel('resources/Cluster_Review_Pull.xlsx', sheet_name='Raw_data')

 ### Separate the Dataset by Class and then into Features (X) from the Target (y)

In [104]:
df_class0 = clustered_df[clustered_df["Class"]==0]

y0 = df_class0['Population_Change']
X0 = df_class0.drop(columns=['Population_Change', 'Country_Code', 'Year', 'Class'])

# Check the balance of our target values
y0.value_counts()

0    2945
1    1449
Name: Population_Change, dtype: int64

In [105]:
df_class1 = clustered_df[clustered_df["Class"]==1]

y1 = df_class1['Population_Change']
X1 = df_class1.drop(columns=['Population_Change', 'Country_Code', 'Year', 'Class'])

# Check the balance of our target values
y1.value_counts()

0    1538
1    1342
Name: Population_Change, dtype: int64

In [101]:
df_class2 = clustered_df[clustered_df["Class"]==2]

y2 = df_class2['Population_Change']
X2 = df_class2.drop(columns=['Population_Change', 'Country_Code', 'Year', 'Class'])

# Check the balance of our target values
y2.value_counts()

1    1
Name: Population_Change, dtype: int64

In [102]:
df_class3 = clustered_df[clustered_df["Class"]==3]

y3 = df_class3['Population_Change']
X3 = df_class3.drop(columns=['Population_Change', 'Country_Code', 'Year', 'Class'])

# Check the balance of our target values
y3.value_counts()

1    1
0    1
Name: Population_Change, dtype: int64

In [103]:
df_class4 = clustered_df[clustered_df["Class"]==4]

y4 = df_class4['Population_Change']
X4 = df_class4.drop(columns=['Population_Change', 'Country_Code', 'Year', 'Class'])

# Check the balance of our target values
y4.value_counts()

1    1
Name: Population_Change, dtype: int64

 ### Split our data into training and testing for Class 0 and 1

In [106]:
from sklearn.model_selection import train_test_split

X_train0, X_test0, y_train0, y_test0 = train_test_split(X0, 
                                                    y0, 
                                                    random_state=1, 
                                                    stratify=y0)
X_train0.shape

(3295, 8)

In [107]:
from sklearn.model_selection import train_test_split

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, 
                                                    y1, 
                                                    random_state=1, 
                                                    stratify=y1)
X_train1.shape

(2160, 8)

 ## Create a Logistic Regression Model

In [108]:
classifier0 = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [109]:
classifier1 = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

 ### Fit (train) or model using the training data

In [110]:
classifier0.fit(X_train0, y_train0)

LogisticRegression(max_iter=200, random_state=1)

In [111]:
classifier1.fit(X_train1, y_train1)

LogisticRegression(max_iter=200, random_state=1)

 ### Make predictions

In [112]:
y_pred0 = classifier0.predict(X_test0)
results0 = pd.DataFrame({"Prediction": y_pred0, "Actual": y_test0}).reset_index(drop=True)
results0

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
1094,0,0
1095,0,0
1096,0,0
1097,0,0


In [113]:
y_pred1 = classifier1.predict(X_test1)
results1 = pd.DataFrame({"Prediction": y_pred1, "Actual": y_test1}).reset_index(drop=True)
results1

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,1,0
4,0,1
...,...,...
715,0,1
716,0,0
717,0,1
718,0,1


### Measure Outcomes for Class 0

In [114]:
# Calculated the accuracy score
print(accuracy_score(y_test0, y_pred0))

0.6706096451319381


In [115]:
# Display the confusion matrix
matrix0 = confusion_matrix(y_test0, y_pred0)
print(matrix0)

[[737   0]
 [362   0]]


In [116]:
# Print the classification report
report0 = classification_report(y_test0, y_pred0)
print(report0)

# Print the imbalanced classification report
print(classification_report_imbalanced(y_test0, y_pred0))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80       737
           1       0.00      0.00      0.00       362

    accuracy                           0.67      1099
   macro avg       0.34      0.50      0.40      1099
weighted avg       0.45      0.67      0.54      1099

                   pre       rec       spe        f1       geo       iba       sup

          0       0.67      1.00      0.00      0.80      0.00      0.00       737
          1       0.00      0.00      1.00      0.00      0.00      0.00       362

avg / total       0.45      0.67      0.33      0.54      0.00      0.00      1099




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Measure Outcomes for Class 1

In [117]:
# Calculated the accuracy score
print(accuracy_score(y_test1, y_pred1))

0.5888888888888889


In [118]:
# Display the confusion matrix
matrix1 = confusion_matrix(y_test1, y_pred1)
print(matrix1)

[[310  74]
 [222 114]]


In [119]:
# Print the classification report
report1 = classification_report(y_test1, y_pred1)
print(report1)

# Print the imbalanced classification report
print(classification_report_imbalanced(y_test1, y_pred1))

              precision    recall  f1-score   support

           0       0.58      0.81      0.68       384
           1       0.61      0.34      0.44       336

    accuracy                           0.59       720
   macro avg       0.59      0.57      0.56       720
weighted avg       0.59      0.59      0.56       720

                   pre       rec       spe        f1       geo       iba       sup

          0       0.58      0.81      0.34      0.68      0.52      0.29       384
          1       0.61      0.34      0.81      0.44      0.52      0.26       336

avg / total       0.59      0.59      0.56      0.56      0.52      0.27       720

