## **Multinomial logistic regression**

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as st
import pingouin as pg

In [21]:
chat_up  = pd.read_csv('/home/atrides/Desktop/R/statistics_with_Python/08_LogisticRegression/Data_Files/Chat-Up Lines.dat', sep='\t')
print(chat_up.head())

               Success  Funny  Sex  Good_Mate Gender
0     Get Phone Number      3    7          6   Male
1  Go Home with Person      5    7          2   Male
2     Get Phone Number      4    6          6   Male
3  Go Home with Person      3    7          5   Male
4     Get Phone Number      5    1          6   Male


In [22]:
print(chat_up['Success'].unique())

['Get Phone Number' 'Go Home with Person' 'No response/Walk Off']


In [23]:
chat_up['Successx'] = chat_up['Success'].replace({'Get Phone Number':1, 'Go Home with Person':2, 'No response/Walk Off':0})
chat_up['Genderx'] = chat_up['Gender'].replace({'Male':0, 'Female':1})

chat_up['Gen_Funny'] = chat_up['Genderx'] * chat_up['Funny']
chat_up['Gen_Sex'] = chat_up['Genderx'] * chat_up['Sex']

In [26]:
import statsmodels.formula.api as smf
ml01 = smf.mnlogit('Successx ~ Funny + Sex + Good_Mate +Genderx+Gen_Funny+ Gen_Sex', chat_up).fit()
print(ml01.summary())

Optimization terminated successfully.
         Current function value: 0.851702
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:               Successx   No. Observations:                 1020
Model:                        MNLogit   Df Residuals:                     1006
Method:                           MLE   Df Model:                           12
Date:                Wed, 16 Sep 2020   Pseudo R-squ.:                  0.1382
Time:                        01:16:38   Log-Likelihood:                -868.74
converged:                       True   LL-Null:                       -1008.0
Covariance Type:            nonrobust   LLR p-value:                 1.496e-52
Successx=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.7831      0.670     -2.662      0.008      -3.096      -0.470
Funny          0.1394      0.

In [27]:
print(np.exp(ml01.params))

                  0         1
Intercept  0.168121  0.013755
Funny      1.149571  1.375004
Sex        1.318120  1.517832
Good_Mate  1.140926  1.138850
Genderx    0.192776  0.003601
Gen_Funny  1.636307  3.229789
Gen_Sex    0.705869  0.620866


## Checking Assumptions

### Assumptions of Multicollinearity

In [29]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [31]:
chat_up_ = chat_up.copy()
chat_up_.drop(['Success', 'Successx', 'Gender','Gen_Funny', 'Gen_Sex'], inplace=True, axis=1)
print(chat_up_.head())

   Funny  Sex  Good_Mate  Genderx
0      3    7          6        0
1      5    7          2        0
2      4    6          6        0
3      3    7          5        0
4      5    1          6        0


In [33]:
from statsmodels.tools.tools import add_constant

In [34]:
chat_up_ = add_constant(chat_up_)

In [36]:
vif = pd.Series([variance_inflation_factor(chat_up_.values, i) 
               for i in range(1, chat_up_.shape[1])], 
              index=chat_up_.columns[1:])

tolerance  = 1/vif

In [37]:
print(vif)

Funny        1.264375
Sex          1.017911
Good_Mate    1.028197
Genderx      1.218566
dtype: float64


In [38]:
print(tolerance)

Funny        0.790905
Sex          0.982404
Good_Mate    0.972576
Genderx      0.820637
dtype: float64


### also correlation value will show that there is no problem of multicollinearity.So, assumption of  multicollinearity has been followed

In [40]:
# also, correlation table seems ok
print(chat_up[['Funny', 'Sex', 'Good_Mate']].corr())

              Funny       Sex  Good_Mate
Funny      1.000000  0.115608   0.163210
Sex        0.115608  1.000000   0.037946
Good_Mate  0.163210  0.037946   1.000000


### Assumption of Linearity

In [44]:
for i in chat_up_.columns:
    if i=='const' or i=='Genderx':
        pass
    else:
        v = f'log_{i}'
        chat_up_[v] = np.log(chat_up_[i])*chat_up_[i]

In [45]:
chat_up_['Successx'] = chat_up['Successx']
chat_up_['Gen_Funny'] = chat_up['Gen_Funny']
chat_up_['Gen_Sex'] = chat_up['Gen_Sex']

In [46]:
ml02 = smf.mnlogit('Successx ~ Funny + Sex + Good_Mate +Genderx+Gen_Funny+ Gen_Sex+log_Funny+log_Sex+log_Good_Mate', chat_up_).fit()
print(ml02.summary())

Optimization terminated successfully.
         Current function value: 0.840330
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:               Successx   No. Observations:                 1009
Model:                        MNLogit   Df Residuals:                      989
Method:                           MLE   Df Model:                           18
Date:                Wed, 16 Sep 2020   Pseudo R-squ.:                  0.1510
Time:                        01:26:34   Log-Likelihood:                -847.89
converged:                       True   LL-Null:                       -998.65
Covariance Type:            nonrobust   LLR p-value:                 2.342e-53
   Successx=1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -2.4861      1.672     -1.487      0.137      -5.762       0.790
Funny             1.

## by seeing the log interaction term, its pretty clear that many values has significance p<0.05, hence the assumption of linearity of logit has been violated