# Machine Learning for Absolute Beginners

### Handling Categorical Variables

## 0. Import Libraries

In [1]:
import pandas as pd

import statsmodels.formula.api as smf


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## 1. Load and Verify Data

In [2]:
# Read the credit data set from the repository-relative data/ folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/credit.csv")

In [3]:
# Preview the first five rows to check that the file parsed into the expected columns.
df.head(n=5)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Student,Married,Region,Balance
0,14.891,3606,283,2,34,11,No,No,Yes,South,333
1,106.025,6645,483,3,82,15,Yes,Yes,Yes,West,903
2,104.593,7075,514,4,71,11,No,No,No,West,580
3,148.924,9504,681,3,36,11,Yes,No,No,West,964
4,55.882,4897,357,2,68,16,No,No,Yes,South,331


## 2. Create Model Using StatsModels

In [4]:
# Predictors for the formula interface. Columns wrapped in C(...) are declared
# categorical, so the formula machinery builds their indicator terms itself and
# no manual dummy encoding is needed for this model.
predictors = ['Income', 'Limit', 'Rating', 'Age', 'C(Married)', 'C(Student)']
ols_formula = 'Balance ~ ' + ' + '.join(predictors)

# Fit ordinary least squares on the full data set (no train/test split here).
model = smf.ols(formula=ols_formula, data=df).fit()

In [5]:
# Render the fitted model's summary table as plain text.
ols_summary = model.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                Balance   R-squared:                       0.953
Model:                            OLS   Adj. R-squared:                  0.952
Method:                 Least Squares   F-statistic:                     1321.
Date:                Thu, 17 Nov 2022   Prob (F-statistic):          5.82e-257
Time:                        14:52:35   Log-Likelihood:                -2408.9
No. Observations:                 400   AIC:                             4832.
Df Residuals:                     393   BIC:                             4860.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          -480.3385     26.08

## 3. Create Model Using Scikit-learn

### Create Dummy Variables for Specific Categorical Columns

In [6]:
# Text-valued columns that will be converted to indicator (dummy) columns.
categorical_cols = [
    "Student",
    "Married",
]

In [7]:
# One-hot encode the columns listed in `categorical_cols`: each listed column is
# replaced by one indicator column per category (e.g. Student_No / Student_Yes).
# dtype=int pins the indicators to 0/1 integers. Without it the result depends on
# the installed pandas (bool True/False on pandas >= 2.0, uint8 before), so the
# table shown below would not reproduce across versions.
# NOTE: this rebinds `df` and consumes the listed columns, so re-running this cell
# without reloading the data fails because those columns no longer exist.
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

In [8]:
# Preview the encoded frame: Student and Married should now appear as *_No / *_Yes columns.
df.head(n=5)

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Own,Region,Balance,Student_No,Student_Yes,Married_No,Married_Yes
0,14.891,3606,283,2,34,11,No,South,333,1,0,0,1
1,106.025,6645,483,3,82,15,Yes,West,903,0,1,0,1
2,104.593,7075,514,4,71,11,No,West,580,1,0,1,0
3,148.924,9504,681,3,36,11,Yes,West,964,1,0,1,0
4,55.882,4897,357,2,68,16,No,South,331,1,0,0,1


### Fit Model Using Scikit-learn

In [13]:
# Feature columns for the scikit-learn model. Only the *_Yes indicator of each
# dummy pair is used; the matching *_No column is its complement.
# NOTE(review): the statsmodels formula earlier in this notebook also includes
# Age, which is not part of this feature list — confirm the omission is intended
# before comparing the two models.
feature_cols = ['Income', 'Limit', 'Rating', 'Student_Yes', 'Married_Yes']

X = df[feature_cols]
y = df['Balance']  # regression target

In [14]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_test, y_train, y_test = split_parts

In [15]:
# Fit a linear regression on the training split only.
# `model` is rebound here: from this cell on it refers to the scikit-learn
# estimator, not the statsmodels result fitted earlier in the notebook.
model = LinearRegression().fit(X_train, y_train)

# fit() returns the estimator itself; leaving it as the last expression keeps
# the cell's displayed output.
model

LinearRegression()

### Evaluate Model

In [12]:
# Print the model's score (R^2 for a regressor) on the held-out test split
# first, then on the training split, in that order.
for features, target in ((X_test, y_test), (X_train, y_train)):
    print(model.score(features, target))

0.9451331781163403
0.9547007615882039
