In [4]:
import numpy as np
import pandas as pd

# 1. Credit Data

In [5]:
# Load the tab-separated, bz2-compressed credit data (compression is inferred
# from the file extension). `sep` must be passed by keyword: the positional
# form was deprecated and is rejected by pandas >= 2.0.
c = pd.read_csv('./credit.tsv.bz2', sep='\t')

## 1.1

In [6]:
# Dimensions of the credit data as (rows, columns).
c.shape

(400, 12)

In [7]:
# Non-null observations per column; a count equal to the row count means no missing values.
c.count()

ID           400
Income       400
Limit        400
Rating       400
Cards        400
Age          400
Education    400
Gender       400
Student      400
Married      400
Ethnicity    400
Balance      400
dtype: int64

In [8]:
# Column types: `object` columns are the string-valued categorical variables.
c.dtypes

ID             int64
Income       float64
Limit          int64
Rating         int64
Cards          int64
Age            int64
Education      int64
Gender        object
Student       object
Married       object
Ethnicity     object
Balance        int64
dtype: object

## 1.2

In [9]:
import statsmodels.formula.api as smf

In [10]:
# Simple OLS of Balance on the binary Student variable (formula API).
r = smf.ols('Balance ~ Student', data=c).fit()

In [11]:
# Show the fitted model's summary tables (fit statistics and coefficient estimates).
r.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.067
Model:,OLS,Adj. R-squared:,0.065
Method:,Least Squares,F-statistic:,28.62
Date:,"Sun, 23 Feb 2020",Prob (F-statistic):,1.49e-07
Time:,20:36:24,Log-Likelihood:,-3005.5
No. Observations:,400,AIC:,6015.0
Df Residuals:,398,BIC:,6023.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,480.3694,23.434,20.499,0.000,434.300,526.439
Student[T.Yes],396.4556,74.104,5.350,0.000,250.771,542.140

0,1,2,3
Omnibus:,20.866,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.92
Skew:,0.544,Prob(JB):,1.74e-05
Kurtosis:,2.637,Cond. No.,3.37


The predicted balance for someone who is not a student is 480.37. This number increases by 396.46 (to about 876.83) when the person is a student. The difference is statistically significant: its p-value is below 0.001 (displayed as 0.000).

## 1.3

In [13]:
# Simple OLS of Balance on the three-level categorical Ethnicity variable.
r = smf.ols('Balance ~ Ethnicity', data=c).fit()

In [14]:
# Show the fitted model's summary tables (fit statistics and coefficient estimates).
r.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.005
Method:,Least Squares,F-statistic:,0.04344
Date:,"Sun, 23 Feb 2020",Prob (F-statistic):,0.957
Time:,20:38:23,Log-Likelihood:,-3019.3
No. Observations:,400,AIC:,6045.0
Df Residuals:,397,BIC:,6057.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,531.0000,46.319,11.464,0.000,439.939,622.061
Ethnicity[T.Asian],-18.6863,65.021,-0.287,0.774,-146.515,109.142
Ethnicity[T.Caucasian],-12.5025,56.681,-0.221,0.826,-123.935,98.930

0,1,2,3
Omnibus:,28.829,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.395
Skew:,0.581,Prob(JB):,1.13e-06
Kurtosis:,2.46,Cond. No.,4.39


In [15]:
# Count observations per Ethnicity level to identify which category the regression omitted.
c.Ethnicity.value_counts()

Caucasian           199
Asian               102
African American     99
Name: Ethnicity, dtype: int64

Since there are 3 values of Ethnicity but only two are displayed in the regression model, the missing value (African American) is the reference category and is represented by the intercept. Based on this, the predicted balance of an African American is 531. This balance is lower by 18.69 for an Asian and by 12.50 for a Caucasian. However, only the intercept (the mean balance for African Americans) is statistically different from zero; the two ethnicity coefficients have high p-values (0.774 and 0.826), indicating that the differences between ethnic groups could occur purely due to chance.

## 1.4

In [16]:
# Simple OLS of Balance on Cards, treating the card count as a numeric predictor.
r = smf.ols('Balance ~ Cards', data=c).fit()

In [17]:
# Show the fitted model's summary tables (fit statistics and coefficient estimates).
r.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,2.997
Date:,"Sun, 23 Feb 2020",Prob (F-statistic):,0.0842
Time:,20:43:12,Log-Likelihood:,-3017.9
No. Observations:,400,AIC:,6040.0
Df Residuals:,398,BIC:,6048.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,434.2861,54.569,7.958,0.000,327.006,541.566
Cards,28.9869,16.743,1.731,0.084,-3.929,61.903

0,1,2,3
Omnibus:,28.964,Durbin-Watson:,1.957
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.603
Skew:,0.566,Prob(JB):,1.67e-06
Kurtosis:,2.437,Cond. No.,8.37


In this model, the predicted balance of a person with no cards is 434.29, and for every additional card the person has, this balance increases by 28.99. However, this relationship may be due to chance: the p-value of 0.084 is fairly low but above the conventional 0.05 threshold, so the effect is not statistically significant at the 5% level.

In [18]:
# Same regression, but C(...) makes Cards categorical: one dummy per card count.
r = smf.ols('Balance ~ C(Cards)', data=c).fit()

In [19]:
# Show the fitted model's summary tables (fit statistics and coefficient estimates).
r.summary()

0,1,2,3
Dep. Variable:,Balance,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.003
Method:,Least Squares,F-statistic:,1.144
Date:,"Sun, 23 Feb 2020",Prob (F-statistic):,0.332
Time:,20:45:47,Log-Likelihood:,-3014.7
No. Observations:,400,AIC:,6047.0
Df Residuals:,391,BIC:,6083.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,531.1373,64.286,8.262,0.000,404.748,657.527
C(Cards)[T.2],-58.1720,77.236,-0.753,0.452,-210.023,93.679
C(Cards)[T.3],-39.0742,77.663,-0.503,0.615,-191.763,113.615
C(Cards)[T.4],45.2794,84.024,0.539,0.590,-119.916,210.475
C(Cards)[T.5],-8.1373,101.645,-0.080,0.936,-207.977,191.702
C(Cards)[T.6],149.6809,152.622,0.981,0.327,-150.381,449.743
C(Cards)[T.7],497.6127,238.379,2.087,0.037,28.947,966.278
C(Cards)[T.8],106.8627,463.574,0.231,0.818,-804.547,1018.272
C(Cards)[T.9],-149.1373,463.574,-0.322,0.748,-1060.547,762.272

0,1,2,3
Omnibus:,28.038,Durbin-Watson:,1.928
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26.387
Skew:,0.568,Prob(JB):,1.86e-06
Kurtosis:,2.459,Cond. No.,22.5


Making Cards a categorical variable separates each value out into its own dummy variable. The reference category is one card (the smallest value in the data, which has no dummy of its own), so in this model we find that having one card results in a predicted balance of 531.14. If a person has two, three, five, or nine cards, this balance will reduce, while if they have four, six, seven, or eight cards this balance will increase. If this seems a little random, it is because none of these differences (apart from the intercept and the seven-card dummy, p = 0.037) is statistically significant at the 5% level, as indicated by the large p-values; the model as a whole is not significant either (F-test p = 0.332).

# 2. Implement Categorical to Dummies

In [32]:
# Exploratory check: row labels of the observations whose Ethnicity is 'Asian'.
c.index[c['Ethnicity'] == 'Asian']

Int64Index([  1,   2,   3,   7,  12,  17,  18,  19,  20,  31,
            ...
            349, 354, 357, 361, 364, 385, 387, 390, 392, 399],
           dtype='int64', length=102)

In [76]:
def makeDummies(var, name, ref):
    """Convert a categorical Series into 0/1 dummy columns, omitting a reference level.

    Parameters
    ----------
    var : pandas.Series
        The categorical variable to encode.
    name : str
        Prefix for the generated column names; each column is called
        ``name + str(level)``.
    ref : str or any value with a matching ``str()``
        The reference level whose column is dropped (e.g. ``'No'``, ``'1'`` or ``1``).

    Returns
    -------
    pandas.DataFrame
        One int64 indicator column per non-reference level, in order of first
        appearance in ``var``, indexed like ``var``.

    Raises
    ------
    KeyError
        If ``ref`` is not one of the levels present in ``var``.
    """
    # Stringify the reference so numeric levels can be passed as-is.
    ref_col = name + str(ref)
    # One vectorised comparison per level; uses only `var`, never a global frame.
    indicators = {
        name + str(level): (var == level).to_numpy(dtype='int64')
        for level in var.unique()
    }
    # Keep the caller's index so the dummies line up with the source data.
    data = pd.DataFrame(indicators, index=var.index)
    # Keyword form is required: the positional axis argument was removed in pandas 2.0.
    return data.drop(columns=ref_col)

In [73]:
# Two-level variable: dropping the reference 'No' leaves a single indicator column.
test1 = makeDummies(c.Student, 'student', 'No')
test1.head(10)

Unnamed: 0,studentYes
0,0
1,1
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,1


In [74]:
# Three-level variable with 'Caucasian' as the reference; note the regression in 1.3
# used African American as its baseline instead.
test2 = makeDummies(c.Ethnicity, 'ethn', 'Caucasian')
test2.head(10)

Unnamed: 0,ethnAsian,ethnAfrican American
0,0,0
1,1,0
2,1,0
3,1,0
4,0,0
5,0,0
6,0,1
7,1,0
8,0,0
9,0,1


In [78]:
# Numeric variable with an empty prefix; the reference level is given as the
# string '1', matching the stringified column names.
test3 = makeDummies(c.Cards, '', '1')
test3.head(10)

Unnamed: 0,2,3,4,5,6,7,8,9
0,1,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0
6,1,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0
9,0,1,0,0,0,0,0,0
