# Setting up Logistic Regression for the VGG16 extracted features

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_validate
from time import time
from Loader import train_data, test_data

In [2]:
df_real = pd.read_csv("Real_Train_Features.csv")
df_real.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,503,504,505,506,507,508,509,510,511,class
0,0,27.489777,0.0,0.0,0.0,0.0,4.406034,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.239311,5.528212,0.0,0.0,0.0,Real
1,1,0.0,0.0,0.0,27.298651,0.0,21.718538,17.454405,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,22.730452,0.0,0.0,0.0,Real
2,2,0.0,0.0,0.0,0.0,0.0,7.493225,0.0,1.344077,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Real


In [3]:
df_fake = pd.read_csv("Fake_Train_Features.csv")
df_fake.head(3)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,503,504,505,506,507,508,509,510,511,class
0,0,0.0,14.31729,0.0,29.562475,0.0,39.90519,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fake
1,1,0.0,0.0,0.0,0.0,10.574826,22.795403,0.0,0.0,0.0,...,15.529095,0.0,0.0,0.0,0.0,29.724703,0.0,0.0,18.845037,Fake
2,2,6.629388,0.0,0.0,0.0,13.008304,0.0,0.0,41.318893,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.707954,0.0,Fake


In [4]:
df = pd.concat([df_real,df_fake],axis=0)
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,503,504,505,506,507,508,509,510,511,class
0,0,27.489777,0.0,0.000000,0.000000,0.000000,4.406034,0.000000,0.000000,0.0,...,0.000000,0.00000,0.0,0.0,3.239311,5.528212,0.0,0.000000,0.000000,Real
1,1,0.000000,0.0,0.000000,27.298651,0.000000,21.718538,17.454405,0.000000,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,22.730452,0.0,0.000000,0.000000,Real
2,2,0.000000,0.0,0.000000,0.000000,0.000000,7.493225,0.000000,1.344077,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,Real
3,3,0.000000,0.0,29.934275,39.067974,0.000000,24.231058,6.911937,0.000000,0.0,...,16.403696,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,17.574917,Real
4,4,0.000000,0.0,0.000000,24.634607,26.340021,0.000000,0.000000,0.000000,0.0,...,13.876422,0.00000,0.0,0.0,0.000000,28.267179,0.0,0.000000,0.000000,Real
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.000000,0.0,0.000000,4.518451,0.723764,17.608213,0.000000,0.000000,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,Fake
49996,49996,0.000000,0.0,0.000000,0.000000,10.060492,0.000000,0.000000,0.000000,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,1.049656,0.000000,Fake
49997,49997,3.222649,0.0,0.000000,0.000000,22.387257,53.118912,0.000000,0.000000,0.0,...,0.000000,6.03152,0.0,0.0,0.000000,0.000000,0.0,0.000000,12.272790,Fake
49998,49998,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,11.401400,0.000000,0.0,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,3.337973,Fake


In [5]:
df = df.iloc[:,1:]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,class
0,27.489777,0.0,0.000000,0.000000,0.000000,4.406034,0.000000,0.000000,0.0,9.508448,...,0.000000,0.00000,0.0,0.0,3.239311,5.528212,0.0,0.000000,0.000000,Real
1,0.000000,0.0,0.000000,27.298651,0.000000,21.718538,17.454405,0.000000,0.0,14.870205,...,0.000000,0.00000,0.0,0.0,0.000000,22.730452,0.0,0.000000,0.000000,Real
2,0.000000,0.0,0.000000,0.000000,0.000000,7.493225,0.000000,1.344077,0.0,0.000000,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,Real
3,0.000000,0.0,29.934275,39.067974,0.000000,24.231058,6.911937,0.000000,0.0,0.000000,...,16.403696,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,17.574917,Real
4,0.000000,0.0,0.000000,24.634607,26.340021,0.000000,0.000000,0.000000,0.0,0.000000,...,13.876422,0.00000,0.0,0.0,0.000000,28.267179,0.0,0.000000,0.000000,Real
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,0.000000,0.0,0.000000,4.518451,0.723764,17.608213,0.000000,0.000000,0.0,12.797303,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,Fake
49996,0.000000,0.0,0.000000,0.000000,10.060492,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,1.049656,0.000000,Fake
49997,3.222649,0.0,0.000000,0.000000,22.387257,53.118912,0.000000,0.000000,0.0,0.000000,...,0.000000,6.03152,0.0,0.0,0.000000,0.000000,0.0,0.000000,12.272790,Fake
49998,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,11.401400,0.000000,0.0,0.000000,...,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.000000,3.337973,Fake


In [6]:
df_sample = df.sample(frac=1.0, random_state=0)

In [7]:
X = df_sample.drop('class', axis=1)
stnd = StandardScaler()
stnd.fit(X)
X = stnd.transform(X)
y = df_sample['class']

In [8]:
time_start = time()
lgr = LogisticRegression(n_jobs=-1,
                        max_iter=1000)

lgr.fit(X,y)

time_stop = time()
print('Elapsed Time for Logistic Regression:', time_stop-time_start)
print()
print('Baseline Training Accuracy:', lgr.score(X,y))

Elapsed Time for Logistic Regression: 59.52538323402405

Baseline Training Accuracy: 0.8595


In [9]:
time_start = time()
lgrcv = LogisticRegressionCV(cv=5, 
                             random_state=0, 
                             n_jobs=-1,
                             penalty='l1',
                             solver = 'saga',
                             max_iter=5000)

lgrcv.fit(X,y)
time_stop = time()
print('CV Time:', time_stop-time_start)
print()
print('CV Baseline Error:', lgrcv.score(X,y))

# Get the best validation score
val_score = []
for i in range (len(lgrcv.scores_['Real'])):
    val_score.append(lgrcv.scores_['Real'][i].mean())
    
print('CV Valid Error:', max(val_score))

CV Time: 181.8137605190277

CV Baseline Error: 0.85954
CV Valid Error: 0.82011


In [10]:
time_start = time()
lgrcv = LogisticRegressionCV(cv=5, 
                             random_state=0, 
                             n_jobs=-1,
                             penalty='l2',
                             solver = 'saga',
                             max_iter=5000)

lgrcv.fit(X,y)
time_stop = time()
print('CV Time:', time_stop-time_start)
print()
print('CV Baseline:', lgrcv.score(X,y))

# Get the best validation score
val_score = []
for i in range (len(lgrcv.scores_['Real'])):
    val_score.append(lgrcv.scores_['Real'][i].mean())
    
print('CV Valid Error:', max(val_score))

CV Time: 113.18555665016174

CV Baseline: 0.85961
CV Valid Error: 0.859525
