# Machine Learning Model - Logistic Regression

In [15]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [16]:
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

### Load data from the CSV file

In [17]:
# Read the prepared dataset from a path relative to the notebook's working
# directory, then preview the first rows to confirm the columns loaded.
source_path = Path('resources/machine_learning_df.txt')
machine_learning_df = pd.read_csv(source_path)
machine_learning_df.head()

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
0,ABW,1960,0,,,,,
1,ABW,1961,0,,,,0.006275,
2,ABW,1962,0,,,,0.0056,
3,ABW,1963,0,,,,0.005162,
4,ABW,1964,0,,,,0.004881,


### Clean Data (drop rows containing missing values)

In [18]:
# Keep only the rows in which every column has a value. dropna() returns a
# new frame, so machine_learning_df itself is left untouched and this cell
# can be re-run safely.
df = machine_learning_df.dropna(axis=0, how='any')
df

Unnamed: 0,Country_Code,Year,Population_Change,Inflation_Diff,Military_Diff,Export_Diff,Life_Diff,GDP_Diff
94,AFE,1991,0,0.052202,-0.012500,-0.013075,-0.002066,0.034620
95,AFE,1992,0,-0.015105,-0.006331,0.012312,-0.001786,-0.012690
96,AFE,1993,0,-0.030320,0.001489,0.000199,-0.001315,0.048812
97,AFE,1994,0,0.017172,-0.004847,0.010337,-0.000951,0.015282
98,AFE,1995,0,-0.025642,-0.004848,0.006704,-0.000657,0.122841
...,...,...,...,...,...,...,...,...
16750,ZWE,2015,0,-0.022332,-0.000012,-0.017700,0.019243,0.023985
16751,ZWE,2016,0,0.008873,-0.001444,0.007834,0.012766,0.029332
16752,ZWE,2017,0,0.024376,-0.001975,-0.002846,0.008591,-0.144232
16753,ZWE,2018,0,0.097249,-0.003222,0.083909,0.006298,0.030177


 ### Separate the Features (X) from the Target (y)

In [19]:
# Features (X): everything except the target and the Country_Code / Year
# columns. Target (y): the Population_Change column.
non_feature_columns = ['Population_Change', 'Country_Code', 'Year']
X = df.drop(columns=non_feature_columns)
y = df['Population_Change']

In [20]:
# Check the balance of our target values: count how many rows fall into
# each Population_Change class, so any class imbalance is visible before
# the data is split and modelled.
y.value_counts()

0    4484
1    2794
Name: Population_Change, dtype: int64

 ### Split our data into training and testing sets

In [6]:
# Hold out 25% of the rows for testing. This is train_test_split's default
# when neither test_size nor train_size is given; it is spelled out here so
# the split ratio is visible to the reader.
# stratify=y keeps the class proportions of the target the same in both
# splits, and random_state=1 makes the split repeatable across runs.
# (train_test_split is imported with the other scikit-learn names at the top
# of the notebook rather than in this cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)
X_train.shape

(5458, 5)

 ## Create a Logistic Regression Model

In [7]:
# Logistic regression using the lbfgs solver, capped at 200 optimisation
# iterations, with a fixed random_state so repeated runs build the same
# estimator.
# NOTE(review): the confusion matrix recorded further down shows almost
# every test row being predicted as class 0. Options such as
# class_weight='balanced' or scaling the features may be worth evaluating
# -- TODO confirm with an experiment before changing the model.
classifier = LogisticRegression(
    random_state=1,
    max_iter=200,
    solver='lbfgs',
)

 ### Fit (train) our model using the training data

In [8]:
# Train the classifier on the training split only; X_test / y_test are not
# touched here and are used for evaluation in the following cells.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

 ### Make predictions and measure outcomes

In [9]:
# Predict a class for every held-out row, then place the predictions next to
# the true labels so they can be compared row by row.
y_pred = classifier.predict(X_test)
comparison = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
# Replace the row labels inherited from y_test with a fresh 0..n-1 index.
results = comparison.reset_index(drop=True)
results

Unnamed: 0,Prediction,Actual
0,0,1
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
1815,0,0
1816,0,0
1817,0,0
1818,0,0


In [12]:
# Calculate the accuracy score: the fraction of test rows whose predicted
# class matches the actual class.
# NOTE(review): the target classes are imbalanced, so plain accuracy can look
# acceptable even when the minority class is rarely predicted.
# balanced_accuracy_score is imported at the top of the notebook but is not
# used anywhere in the visible cells -- consider reporting it as well.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.6142857142857143


In [13]:
# Build and display the confusion matrix for the test split.
# Rows are the actual classes and columns are the predicted classes.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matrix)

[[1108   13]
 [ 689   10]]


In [14]:
# Standard scikit-learn report: precision, recall, f1-score and support for
# each class, plus the overall averages.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

# imbalanced-learn's version of the report, which adds the columns shown as
# spe, geo and iba alongside precision / recall / f1.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

              precision    recall  f1-score   support

           0       0.62      0.99      0.76      1121
           1       0.43      0.01      0.03       699

    accuracy                           0.61      1820
   macro avg       0.53      0.50      0.39      1820
weighted avg       0.55      0.61      0.48      1820

                   pre       rec       spe        f1       geo       iba       sup

          0       0.62      0.99      0.01      0.76      0.12      0.02      1121
          1       0.43      0.01      0.99      0.03      0.12      0.01       699

avg / total       0.55      0.61      0.39      0.48      0.12      0.01      1820

