# 9. Intro to ML in Finance 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tips data from the CSV file and preview its first three rows.
tdf = pd.read_csv(filepath_or_buffer="data/data9_tips.csv")
tdf.head(n=3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [3]:
import patsy as pts
# dmatrices returns the response (y) and the design matrix (x) as two separate DataFrames
y, x = pts.dmatrices(
    formula_like='tip ~ total_bill + size + C(time)',
    data=tdf,
    return_type='dataframe',
)

In [4]:
# preview the design matrix: it carries an Intercept column and a dummy column for time (T.Lunch)
x.head(3)

Unnamed: 0,Intercept,C(time)[T.Lunch],total_bill,size
0,1.0,0.0,16.99,2.0
1,1.0,0.0,10.34,3.0
2,1.0,0.0,21.01,3.0


In [5]:
#the scikit library
from sklearn.model_selection import train_test_split

# Split regressors (x) and response (y) into train and test halves -> 4 parts in total.
# random_state sets the seed so the same split is produced on every run.
ind_train, ind_test, dep_train, dep_test = train_test_split(
    x, y,
    test_size=0.5,
    random_state=110,
)

In [6]:
# linear regression (using sklearn instead of the statsmodels library)
from sklearn.linear_model import LinearRegression

In [7]:
# Estimate the model on the training half only; fit() hands back the fitted estimator.
# NOTE: x already contains patsy's Intercept column while LinearRegression also fits its
# own intercept, which is why the first entry of model.coef_ comes out as 0.
model = LinearRegression().fit(ind_train, dep_train)

model.intercept_

array([0.43713821])

In [8]:
# slopes in the column order of x: Intercept, C(time)[T.Lunch], total_bill, size
model.coef_

array([[0.        , 0.12422268, 0.1107945 , 0.10891579]])

In [10]:
# Write out the fitted equation. Column 0 of coef_ belongs to the Intercept column of x
# and is skipped; the constant term comes from model.intercept_ instead.
print(
    f"tip = {model.intercept_[0]:.3f} + "
    f"{model.coef_[0, 1]:.3f} Lunch + "
    f"{model.coef_[0, 2]:.3f} total_bill + "
    f"{model.coef_[0, 3]:.3f} size"
)

tip = 0.437 + 0.124 Lunch + 0.111 total_bill + 0.109 size


In [12]:
# Predict tips for the held-out test half, then show the shape of the predictions.
pred = model.predict(X=ind_test)
pred.shape

(122, 1)

In [13]:
# shape of the observed test responses, for comparison with the shape of the predictions
dep_test.shape

(122, 1)

In [14]:
# R-squared on the test half: score() evaluates the fitted model on data it was not trained on.
score = model.score(X=ind_test, y=dep_test)
print(f"LR Model Score = {score:.3f}")

LR Model Score = 0.378


In [15]:
# Linear regression using statsmodels now (full sample)
import statsmodels.formula.api as smf

# Specify and fit in one chained call; alternatively, keep the unfitted model
# and call .fit() on it in the next line.
result = smf.ols(
    formula='tip ~ total_bill + size + C(time)',
    data=tdf,
).fit()
result.summary()

0,1,2,3
Dep. Variable:,tip,R-squared:,0.468
Model:,OLS,Adj. R-squared:,0.461
Method:,Least Squares,F-statistic:,70.34
Date:,"Mon, 25 Mar 2024",Prob (F-statistic):,1.1300000000000001e-32
Time:,16:46:20,Log-Likelihood:,-347.98
No. Observations:,244,AIC:,704.0
Df Residuals:,240,BIC:,718.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6671,0.205,3.249,0.001,0.263,1.072
C(time)[T.Lunch],0.0041,0.148,0.028,0.978,-0.286,0.295
total_bill,0.0928,0.009,10.037,0.000,0.075,0.111
size,0.1926,0.085,2.253,0.025,0.024,0.361

0,1,2,3
Omnibus:,24.79,Durbin-Watson:,2.1
Prob(Omnibus):,0.0,Jarque-Bera (JB):,46.226
Skew:,0.546,Prob(JB):,9.17e-11
Kurtosis:,4.831,Cond. No.,73.7
