# Interfacing between Pandas and Model code:

In [57]:
import pandas as pd
import numpy as np


# Small example frame: one integer column (x0) and two float columns (x1, x2).
data = pd.DataFrame(
    dict(
        x0=[1, 2, 3, 4, 5],
        x1=[0.01, -0.01, 0.25, -4.1, 0.0],
        x2=[-1.5, 0.0, 3.6, 1.3, -2.0],
    )
)
data

Unnamed: 0,x0,x1,x2
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [58]:
# Column labels of the frame, returned as a pandas Index.
data.columns

Index(['x0', 'x1', 'x2'], dtype='object')

In [59]:
# .values exposes the frame's contents as a NumPy array; the integer column x0
# shows up as floats because it shares one array with the float columns.
data.values

array([[ 1.  ,  0.01, -1.5 ],
       [ 2.  , -0.01,  0.  ],
       [ 3.  ,  0.25,  3.6 ],
       [ 4.  , -4.1 ,  1.3 ],
       [ 5.  ,  0.  , -2.  ]])

In [60]:
# Rebuild a DataFrame from the raw 2-D array, supplying the column labels explicitly.
df2 = pd.DataFrame(
    data=data.values,
    columns=["x0", "x1", "x2"],
)
df2

Unnamed: 0,x0,x1,x2
0,1.0,0.01,-1.5
1,2.0,-0.01,0.0
2,3.0,0.25,3.6
3,4.0,-4.1,1.3
4,5.0,0.0,-2.0


The `.values` attribute is intended to be used when the data is homogeneous, for example all numeric types.
If the data is heterogeneous, the result will be an ndarray of Python objects.

In [61]:
# Copy of `data` with one extra column of strings, making the frame heterogeneous.
df3 = data.assign(strings=["a", "b", "c", "d", "e"])
df3

Unnamed: 0,x0,x1,x2,strings
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,c
3,4,-4.1,1.3,d
4,5,0.0,-2.0,e


In [62]:
# With the string column present, the resulting array has dtype=object.
df3.values

array([[1, 0.01, -1.5, 'a'],
       [2, -0.01, 0.0, 'b'],
       [3, 0.25, 3.6, 'c'],
       [4, -4.1, 1.3, 'd'],
       [5, 0.0, -2.0, 'e']], dtype=object)

In [63]:
# Here we want only a subset of the columns. Select them by label with .loc,
# which leaves out the non-numeric "strings" column that was added to df3.
# Appending .values to this expression returns the same subset as a NumPy array.

df3.loc[:, ["x0", "x1", "x2"]]

Unnamed: 0,x0,x1,x2
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [64]:
# Add a categorical column to `data` (modifies the frame in place), with the
# allowed categories listed explicitly.
data["category"] = pd.Categorical(["a", "b", "a", "a", "b"], categories = ["a", "b"])
data

Unnamed: 0,x0,x1,x2,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


In [65]:
# One-hot encode the categorical column into one indicator column per category.
# dtype=int pins 0/1 integer indicators; recent pandas releases otherwise
# default to boolean True/False columns.
dummies = pd.get_dummies(data["category"], prefix="category", dtype=int)
dummies

Unnamed: 0,category_a,category_b
0,1,0
1,0,1
2,1,0
3,1,0
4,0,1


In [66]:
# Swap the raw categorical column for its indicator columns.
# drop() returns a new frame, so `data` itself keeps its "category" column.
data_with_dummies = (
    data
    .drop(columns="category")
    .join(dummies)
)
data_with_dummies

Unnamed: 0,x0,x1,x2,category_a,category_b
0,1,0.01,-1.5,1,0
1,2,-0.01,0.0,0,1
2,3,0.25,3.6,1,0
3,4,-4.1,1.3,1,0
4,5,0.0,-2.0,0,1


In [67]:
# The original frame still holds the "category" column; drop() above returned a new frame.
data

Unnamed: 0,x0,x1,x2,category
0,1,0.01,-1.5,a
1,2,-0.01,0.0,b
2,3,0.25,3.6,a
3,4,-4.1,1.3,a
4,5,0.0,-2.0,b


# Introduction to statsmodels

In [68]:
# Linear models range from basic (ordinary least squares) to more complex (iteratively reweighted least squares).
# Linear models in statsmodels have two main interfaces: array-based and formula-based.
# These are accessed through the following API module imports:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [69]:
#we generate a linear model with some random data

def dnorm(mean, variance, size=1):
    """Draw normally distributed samples with the given mean and variance.

    Samples come from NumPy's global random state (``np.random.randn``), so
    call ``np.random.seed`` beforehand for reproducible results.

    Parameters
    ----------
    mean : float
        Mean of the distribution.
    variance : float
        Variance of the distribution; its square root scales the
        standard-normal draws.
    size : int or tuple of int, default 1
        Number of samples, or the shape of the output array. Python ints and
        NumPy integer scalars are both treated as a 1-D length.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size`` (``(size,)`` when ``size`` is a scalar).
    """
    # A scalar size becomes a 1-tuple so it can be unpacked into randn below.
    # np.integer is accepted too, since NumPy integer scalars are not `int`.
    if isinstance(size, (int, np.integer)):
        size = (size,)
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [70]:
# Seed NumPy's global random generator so the random draws below are reproducible.
np.random.seed(12345)

In [71]:
N = 100  # number of observations

# Three predictors with different variances, stacked as the columns of X.
# The draw order (three predictors, then the noise) is kept as-is so the
# seeded random stream yields the same numbers.
X = np.column_stack([
    dnorm(0, 0.4, size=N),
    dnorm(0, 0.6, size=N),
    dnorm(0, 0.2, size=N),
])

eps = dnorm(0, 0.1, size=N)  # additive noise term
beta = [0.1, 0.3, 0.5]       # known coefficients of the true model

# Response: linear combination of the predictors plus noise (no intercept).
y = X.dot(beta) + eps
X

array([[-1.29468492e-01, -1.21275292e+00,  5.04224878e-01],
       [ 3.02910364e-01, -4.35741756e-01, -2.54179861e-01],
       [-3.28521889e-01, -2.53015334e-02,  1.38350968e-01],
       [-3.51474705e-01, -7.19605110e-01, -2.58214633e-01],
       [ 1.24326880e+00, -3.73799164e-01, -5.22629046e-01],
       [ 8.81267227e-01, -2.80898544e-02, -3.68960148e-01],
       [ 5.87601006e-02,  8.48485492e-01, -1.18261588e+00],
       [ 1.78191913e-01,  7.59823931e-01, -6.84173312e-02],
       [ 4.86372577e-01, -4.56615198e-01, -3.36269295e-01],
       [ 7.88314544e-01,  1.22517962e+00, -5.93046604e-02],
       [ 6.37002481e-01, -4.09556235e-01,  6.51724241e-01],
       [-8.19802211e-01,  3.53992127e-01,  2.72581984e-01],
       [ 1.73919980e-01,  7.20350703e-01, -2.20824797e-01],
       [ 1.44777217e-01, -1.21555178e+00,  5.54535862e-01],
       [ 8.55659737e-01, -7.92015008e-01, -6.06967863e-02],
       [ 5.60627140e-01, -3.12028394e-01,  6.39534138e-01],
       [-1.26594659e+00,  1.70788390e-01

In [72]:
# Response values computed above as X·beta + eps.
y

array([ 0.42786349, -0.67348041, -0.09087764, -0.48949442, -0.12894109,
       -0.04501494,  0.08757735, -0.50456809, -0.54582359,  0.26527124,
        0.59784431,  0.45268655,  0.08698737,  0.05540612, -0.09117045,
        0.14472907, -0.15127161, -0.05633559,  1.2167688 , -0.02230032,
       -0.69063922,  0.08524475,  0.73444882, -0.35271834, -0.25469893,
        0.30780133,  0.70383282, -0.5331801 , -0.22072084, -0.09677542,
       -0.49691476, -1.33344177, -0.37685375,  1.25999316, -0.29484543,
       -0.61445479,  0.18725508, -0.40779804,  0.05730302,  0.4745453 ,
       -0.43516233,  0.03148314, -0.05635841,  0.12133475,  0.22345618,
        0.05955794,  0.25805322, -0.2750181 ,  0.30513496, -0.20032791,
        0.08627269, -0.42451706,  0.23481135, -0.32057314,  0.67561398,
       -0.38726135, -0.37863875, -0.16376385, -0.17011089,  0.39236031,
       -0.13687819,  0.18865275, -0.13990581,  0.61372834, -0.40825235,
        0.46866481, -0.59632133, -0.07708193,  0.70818684,  0.14

Here we wrote down the true model with known parameters beta. In this case, dnorm is a helper function for generating normally distributed data with a particular mean and variance.


A linear model is generally fitted with an intercept term.

In [73]:
# Add an intercept column (the leading column of ones in the output) to X,
# then show the first five rows.
X_model = sm.add_constant(X)
X_model[:5]

array([[ 1.        , -0.12946849, -1.21275292,  0.50422488],
       [ 1.        ,  0.30291036, -0.43574176, -0.25417986],
       [ 1.        , -0.32852189, -0.02530153,  0.13835097],
       [ 1.        , -0.35147471, -0.71960511, -0.25821463],
       [ 1.        ,  1.2432688 , -0.37379916, -0.52262905]])

In [74]:
# The sm.OLS class can fit an ordinary least squares linear regression.
# NOTE: the design matrix passed here is X, not X_model, so no intercept column
# is included; the simulated y above was also generated without an intercept.

model = sm.OLS(y,X)
model

<statsmodels.regression.linear_model.OLS at 0x2b09ae40c88>

In [75]:
results = model.fit() # The "fit" method returns a regression results object containing
                      # estimated model parameters and diagnostic output of the model


In [76]:
# Estimated coefficients, one per column of X.
results.params

array([0.17826108, 0.22303962, 0.50095093])

In [77]:
# Print the text summary table of the fitted model.
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.430
Model:                            OLS   Adj. R-squared (uncentered):              0.413
Method:                 Least Squares   F-statistic:                              24.42
Date:                Mon, 29 Aug 2022   Prob (F-statistic):                    7.44e-12
Time:                        22:52:45   Log-Likelihood:                         -34.305
No. Observations:                 100   AIC:                                      74.61
Df Residuals:                      97   BIC:                                      82.42
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

scikit-learn:

In [132]:
# Load the training and test sets from CSV files in the working directory.
# NOTE(review): a later cell reads "titanic/gender_submission.csv" from a
# subfolder — confirm that the data paths are consistent.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [133]:
# Peek at the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [134]:
# Peek at the first rows of the test data (it has no "Survived" column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [135]:
# Count missing values per column of the training set; columns with gaps
# need handling because scikit-learn cannot be fed missing data.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [136]:
# Count missing values per column of the test set.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In statistics and machine learning examples, the typical task is to predict whether a passenger would survive based on features in the data.

# Feature Selection: 

1. Age
2. Gender
3. PClass

We will use age as a predictor but it has missing values, hence we will use missing data imputation.


In [137]:
# Impute missing ages with the median age of the training set.
impute_value = train["Age"].median()
train["Age"] = train["Age"].fillna(impute_value)

# The test set is filled with the same training-set median, not its own.
test["Age"] = test["Age"].fillna(impute_value)

In [138]:
# Add "is_female", a 0/1 encoding of the "Sex" column (1 = female),
# to both the training and the test frame.

train["is_female"] = train["Sex"].eq("female").astype(int)

test["is_female"] = test["Sex"].eq("female").astype(int)

train.iloc[:50]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_female
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,,Q,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,0
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,1


# Alternative: encode the labels with scikit-learn's LabelEncoder.
# The codes go into a new column so the original "Sex" strings stay intact;
# overwriting "Sex" would make the `== "female"` comparison used for
# "is_female" match nothing if that cell were run again.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train["Sex_encoded"] = le.fit_transform(train["Sex"])
# Reuse the mapping fitted on the training data for the test set.
test["Sex_encoded"] = le.transform(test["Sex"])


train.head()

Note that there are also categorical values in the dataset; for these, you need to use Label Encoding or One-Hot Encoding.

In [89]:
# Names of the feature columns used as model inputs.
predictors = ["Pclass", "is_female", "Age"]

In [90]:
# Extract the predictor columns of each frame as NumPy arrays.
X_train = train[predictors].values
X_test = test[predictors].values

In [91]:
# Target vector: the "Survived" column of the training data.
y_train = train["Survived"].values

In [93]:
# First five rows of the training features (columns follow `predictors`).
X_train[:5]

array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       [ 1.,  1., 35.],
       [ 3.,  0., 35.]])

In [94]:
# First five training labels.
y_train[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [95]:
from sklearn.linear_model import LogisticRegression

In [96]:
# Logistic regression with default settings.
# Note: this rebinds `model`, which earlier in the notebook held the statsmodels OLS model.
model = LogisticRegression()

In [97]:
# Fit the classifier on the training features and labels.
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
# Predict a class label for every row of the test features.
y_predict = model.predict(X_test)

In [99]:
# First five predicted labels.
y_predict[:5]

array([0, 0, 0, 0, 1], dtype=int64)

In [121]:
# Load the PassengerId/Survived table that is used below as reference labels
# for the test set.
# NOTE(review): this file is read from a "titanic/" subfolder, while train.csv
# and test.csv were read from the working directory — confirm the paths.
df_s=pd.read_csv("titanic/gender_submission.csv")
df_s

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [123]:
# NOTE(review): gender_submission.csv looks like a sample-submission file rather
# than ground-truth outcomes — confirm before reading scores computed from
# y_test as true test accuracy.
# The labels are later compared with y_predict position by position, so check
# that they are in the same passenger order as the rows of `test`.
assert np.array_equal(
    df_s["PassengerId"].values, test["PassengerId"].values
), "gender_submission.csv rows are not in the same order as test.csv"
y_test = df_s["Survived"].values
y_test

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [124]:
# Number of rows in the test feature matrix.
len(X_test)

418

In [125]:
from sklearn import metrics

# accuracy_score expects (y_true, y_pred): reference labels first, predictions second.
print(metrics.accuracy_score(y_test, y_predict))

0.9712918660287081


In [126]:
# The same score computed by hand: the fraction of positions where the two label arrays agree.
(y_test == y_predict).mean()

0.9712918660287081

In [141]:
# Methods like cross-validation can be used for parameter tuning to avoid overfitting.

from sklearn.linear_model import LogisticRegressionCV

# Cs is passed by keyword: current scikit-learn releases only accept keyword
# arguments in estimator constructors, so the positional form fails there.
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X_train,y_train)



LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

In [143]:
from sklearn.model_selection import cross_val_score

# Score a logistic regression with C=10 using 4-fold cross-validation on the
# training data; one score is returned per fold.
model = LogisticRegression(C=10)

scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=4,
)
scores



array([0.77232143, 0.80269058, 0.77027027, 0.78828829])