In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler 
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn import metrics 
#from sklearn.cross_validation import cross_val_score 
data = sm.datasets.fair.load_pandas().data


In [2]:
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [3]:
data['affairs'] = (data['affairs']>0).astype(int)

In [4]:
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,1


In [5]:
data

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,1
...,...,...,...,...,...,...,...,...,...
6361,5.0,32.0,13.0,2.0,3.0,17.0,4.0,3.0,0
6362,4.0,32.0,13.0,1.0,1.0,16.0,5.0,5.0,0
6363,5.0,22.0,2.5,0.0,2.0,14.0,3.0,1.0,0
6364,5.0,32.0,6.0,1.0,3.0,14.0,3.0,4.0,0


In [6]:
data['affairs'].value_counts()

0    4313
1    2053
Name: affairs, dtype: int64

In [7]:
data.isnull().sum()

rate_marriage      0
age                0
yrs_married        0
children           0
religious          0
educ               0
occupation         0
occupation_husb    0
affairs            0
dtype: int64

In [8]:
X_old = data.drop(columns=['affairs'])
scalar = StandardScaler()
X = scalar.fit_transform(X_old)
X

array([[-1.15425213e+00,  4.26024735e-01, -1.29473293e-03, ...,
         1.28115287e+00, -1.51129246e+00,  8.54069123e-01],
       [-1.15425213e+00, -3.04185400e-01,  5.48189921e-01, ...,
        -9.63641611e-02, -4.50087100e-01,  1.11309009e-01],
       [-1.14052659e-01, -1.03439554e+00, -8.94207296e-01, ...,
         8.21980524e-01, -4.50087100e-01,  8.54069123e-01],
       ...,
       [ 9.26146808e-01, -1.03439554e+00, -8.94207296e-01, ...,
        -9.63641611e-02, -4.50087100e-01, -2.11697133e+00],
       [ 9.26146808e-01,  4.26024735e-01, -4.13408224e-01, ...,
        -9.63641611e-02, -4.50087100e-01,  1.11309009e-01],
       [-1.14052659e-01, -1.03439554e+00, -8.94207296e-01, ...,
         8.21980524e-01, -1.51129246e+00,  1.11309009e-01]])

In [9]:
y = data[['affairs']]
y

Unnamed: 0,affairs
0,1
1,1
2,1
3,1
4,1
...,...
6361,0
6362,0
6363,0
6364,0


In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=144)

In [11]:
x_train

array([[-0.11405266,  0.42602474,  0.54818992, ..., -1.01470885,
         1.67232363,  0.85406912],
       [-0.11405266, -1.03439554, -0.8942073 , ..., -0.09636416,
        -0.4500871 , -0.63145111],
       [-3.23465106,  0.42602474,  1.02898899, ..., -1.01470885,
        -0.4500871 ,  0.85406912],
       ...,
       [-3.23465106, -0.3041854 , -0.41340822, ..., -0.09636416,
        -1.51129246,  1.59682924],
       [ 0.92614681, -1.03439554, -0.8942073 , ..., -0.09636416,
        -1.51129246, -0.63145111],
       [ 0.92614681,  1.15623487,  1.92190156, ..., -1.01470885,
        -0.4500871 ,  0.11130901]])

In [12]:
x_test

array([[-0.11405266, -0.3041854 , -0.8942073 , ..., -0.09636416,
         1.67232363, -1.37421122],
       [ 0.92614681, -0.3041854 , -0.8942073 , ..., -0.09636416,
        -0.4500871 ,  0.11130901],
       [-0.11405266,  1.15623487, -0.41340822, ...,  0.82198052,
         0.61111826,  0.85406912],
       ...,
       [-0.11405266, -1.03439554, -0.8942073 , ..., -1.01470885,
        -1.51129246, -1.37421122],
       [ 0.92614681, -1.03439554, -1.16894962, ...,  0.82198052,
         0.61111826, -1.37421122],
       [ 0.92614681, -1.03439554, -0.8942073 , ..., -0.09636416,
         0.61111826,  0.11130901]])

In [13]:
logr = LogisticRegression()  

In [14]:
logr.fit(x_train,y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [15]:
y_pred=logr.predict_proba(x_test)

In [16]:
y_pred

array([[0.73458931, 0.26541069],
       [0.91387359, 0.08612641],
       [0.85464145, 0.14535855],
       ...,
       [0.81233235, 0.18766765],
       [0.93152819, 0.06847181],
       [0.82273428, 0.17726572]])

In [17]:
y_pred[:,0]

array([0.73458931, 0.91387359, 0.85464145, ..., 0.81233235, 0.93152819,
       0.82273428])

In [18]:
from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import r2_score

In [19]:
print(r2_score(y_test,y_pred[:,0]))

-0.9864229264454054
