### Behaviour of Logistic Regression and SVM if the dataset is not standardized

In [1]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
data = pd.read_csv('/home/ubuntu/drive_kodakandlasrikar99/aaic/8_LinearModels/task_b.csv')

In [3]:
data=data.iloc[:,1:]

In [4]:
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [5]:
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

#### by seeing the correlation we can say that , f3 is more important and f2,f1 are not important 

In [6]:
data.std()

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [7]:
X=data[['f1','f2','f3']].values
Y=data['y'].values
print(X.shape)
print(Y.shape)

(200, 3)
(200,)


In [8]:
X=data[['f1','f2','f3']].to_numpy()

In [9]:
y=data['y'].to_numpy()

### Applying SGD with log loss with out standardization of data

In [10]:
model=SGDClassifier(loss='log')
model.fit((X),y)

SGDClassifier(loss='log')

In [11]:
print(f"Feature Importance : {model.coef_}")
print("accuracy : {} ".format(model.score(X,Y)))

Feature Importance : [[-2026.11468659  6709.78680389 10447.10285327]]
accuracy : 0.475 


#### if the features are not standardized before giving them to this model , then model weights may be proportional to the varience of the respective features .so model won't perform well . so we should standardize the data before giving data to the model

### Applying SGD with Hinge loss with out standardization of data

In [12]:
model=SGDClassifier(loss='hinge')
model.fit((X),y)

SGDClassifier()

In [13]:
print(f"Feature Importance : {model.coef_}")
print("accuracy : {} ".format(model.score(X,Y)))

Feature Importance : [[13260.46161196 20304.16541758  9398.73417681]]
accuracy : 0.47 


#### if the features are not standardized before giving them to this model , then model weights may be proportional to the varience of the respective features .so model won't perform well . so we should standardize the data before giving data to the model

In [14]:
scale=StandardScaler()
X=scale.fit_transform(X)

### Applying SGD with Hinge loss after standardization of data

In [15]:
model=SGDClassifier(loss='hinge')
model.fit(scale.fit_transform(X),y)

SGDClassifier()

In [16]:
print(f"Feature Importance : {model.coef_}")
print("accuracy : {} ".format(model.score(X,Y)))

Feature Importance : [[ 3.41022844  3.57949088 22.63321333]]
accuracy : 0.905 


### because all features are at same scale, The model is able to train well and gave good results

### Applying SGD with log loss after standardization of data

In [17]:
model=SGDClassifier(loss='log')
model.fit(scale.fit_transform(X),y)

SGDClassifier(loss='log')

In [18]:
print(f"Feature Importance : {model.coef_}")
print("accuracy : {} ".format(model.score(X,Y)))

Feature Importance : [[-3.02752538 -0.89763367 11.77952057]]
accuracy : 0.92 


### because all features are at same scale, The model is able to train well and gave good results