In [None]:
import wbgapi as wb
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import yfinance as yf
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Question 1

For this question use the World Bank Data for Turkey for the following indicators. Use [wbgapi](https://pypi.org/project/wbgapi/) for getting the data.

* [Literacy rate, adult female (SE.ADT.LITR.FE.ZS)](https://data.worldbank.org/indicator/SE.ADT.LITR.FE.ZS)
* [Labor force, female (SL.TLF.TOTL.FE.ZS)](https://data.worldbank.org/indicator/SL.TLF.TOTL.FE.ZS)
* [Poverty headcount ratio at national poverty lines (SI.POV.NAHC)](https://data.worldbank.org/indicator/SI.POV.NAHC)
* [Current health expenditure per capita (SH.XPD.CHEX.PC.CD)](https://data.worldbank.org/indicator/SH.XPD.CHEX.PC.CD)
* [GDP per capita (NY.GDP.PCAP.CD)](https://data.worldbank.org/indicator/NY.GDP.PCAP.CD)
* [Mortality rate, under-5 (SH.DYN.MORT)](https://data.worldbank.org/indicator/SH.DYN.MORT)


Using the [statsmodels](https://www.statsmodels.org/stable/index.html) library write the best linear regression model using child mortality as the dependent variable while the rest are considered as independent variables. Pay particular attention to the fact that the order of the variables put into the model significantly impacts the performance of the model. Choose the best model by considering

* with the minimum number of variables and their interactions,
* with the optimal ordering of the independent variables and their interactions,
* $R^2$-score of the model,
* statistical significance of the model coefficients,
* ANOVA analysis of the model.


I create the function **fetch_data** in order to get data from The World Bank according to indicator and name of the data. I limit my selections by Turkey. 

In [8]:
def fetch_data(link, name):
    data1 = pd.DataFrame(list(wb.data.fetch(link)))
    data2 = data1[['time','value']][data1['economy']=='TUR']
    data2.index = data2.time
    del data2['time']
    data2.columns = [[name]]
    return data2.dropna()

In [9]:
literacy = fetch_data('SE.ADT.LITR.FE.ZS','literacy')
literacy

Unnamed: 0_level_0,literacy
time,Unnamed: 1_level_1
YR2019,94.424042
YR2017,93.498268
YR2016,93.563011
YR2015,92.645813
YR2014,92.401817
YR2013,92.138077
YR2012,91.604523
YR2011,90.310097
YR2010,88.073174
YR2009,85.34716


In [17]:
labor = fetch_data('SL.TLF.TOTL.FE.ZS','labor')
labor

Unnamed: 0_level_0,labor
time,Unnamed: 1_level_1
YR2021,32.470214
YR2020,32.175606
YR2019,33.360649
YR2018,33.089766
YR2017,32.799757
YR2016,32.250459
YR2015,31.719798
YR2014,31.027784
YR2013,31.199391
YR2012,30.464131


In [26]:
poverty = fetch_data('SI.POV.NAHC','poverty')
poverty

Unnamed: 0_level_0,poverty
time,Unnamed: 1_level_1
YR2019,15.0
YR2018,14.4
YR2017,13.9
YR2016,13.5
YR2015,14.3
YR2014,14.7
YR2013,15.0
YR2012,15.0
YR2011,16.3
YR2010,16.1


In [31]:
health = fetch_data('SH.XPD.CHEX.PC.CD','health')
health

Unnamed: 0_level_0,health
time,Unnamed: 1_level_1
YR2019,396.466827
YR2018,389.86557
YR2017,442.617615
YR2016,466.7948
YR2015,453.116486
YR2014,525.844727
YR2013,551.401245
YR2012,524.250305
YR2011,531.418579
YR2010,539.327148


In [35]:
gdp = fetch_data('NY.GDP.PCAP.CD','gdp')
gdp

Unnamed: 0_level_0,gdp
time,Unnamed: 1_level_1
YR2021,9586.612450
YR2020,8536.433320
YR2019,9121.515167
YR2018,9454.348443
YR2017,10589.667725
...,...
YR1964,369.583469
YR1963,350.662985
YR1962,309.446624
YR1961,283.828284


In [39]:
mortality = fetch_data('SH.DYN.MORT','mortality')
mortality

Unnamed: 0_level_0,mortality
time,Unnamed: 1_level_1
YR2020,9.5
YR2019,10.1
YR2018,10.7
YR2017,11.4
YR2016,12.1
...,...
YR1964,225.7
YR1963,233.5
YR1962,241.4
YR1961,249.3


I join all 6 categories of data in a data frame so that I can create a linear regression model

In [40]:
df = literacy.join([labor,poverty,health,gdp,mortality])
df.dropna(inplace=True)
df

Unnamed: 0_level_0,literacy,labor,poverty,health,gdp,mortality
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
YR2019,94.424042,33.360649,15.0,396.466827,9121.515167,10.1
YR2017,93.498268,32.799757,13.9,442.617615,10589.667725,11.4
YR2016,93.563011,32.250459,13.5,466.7948,10894.603378,12.1
YR2015,92.645813,31.719798,14.3,453.116486,11006.279524,13.0
YR2014,92.401817,31.027784,14.7,525.844727,12157.990434,13.8
YR2013,92.138077,31.199391,15.0,551.401245,12614.78161,14.7
YR2012,91.604523,30.464131,15.0,524.250305,11795.633457,15.8
YR2011,90.310097,29.832127,16.3,531.418579,11420.555456,16.9
YR2010,88.073174,29.161917,16.1,539.327148,10742.774979,18.1
YR2009,85.34716,28.071295,16.9,500.193054,9103.474051,19.5


I check to see how each variable is in corrrelation to one another and consider these results when I am creating models.

* Literacy-labor are highly correlated in the same direction (0.98)
* Literacy-mortality are highly correlated in the opposite direction (-0.98)
* Mortality-labor are highly correlated in the opposite direction (-0.99)

In [43]:
df.corr().style.background_gradient(cmap='bone_r').set_precision(2)

Unnamed: 0,"('literacy',)",Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
"('literacy',)",1.0,0.98,-0.85,0.24,0.71,-0.98
0.98,1.0,-0.87,0.1,0.59,-0.99,
-0.85,-0.87,1.0,-0.19,-0.64,0.89,
0.24,0.1,-0.19,1.0,0.79,-0.17,
0.71,0.59,-0.64,0.79,1.0,-0.62,
-0.98,-0.99,0.89,-0.17,-0.62,1.0,


The first model I tried gives a good R^2 value as 0.992. Then ,analyzing the coefficients, 4 out of 6 of them have clear confidence intervals, we can see whether they are positive or negative. Literacy and poverty coefficient are unclear. 

Anova table tells us which variable has more effect on the target variable. In this case, literacy is highly important then, labor. GDP is the least important.

This is not a bad model considering we have a good R^2 and some of the coefficients are understandable. There is also no interaction between independent variables.

In [44]:
model = ols('mortality ~ literacy + labor + poverty + health + gdp', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.986
Method:                 Least Squares   F-statistic:                     169.6
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.81e-07
Time:                        13:34:39   Log-Likelihood:                -7.8492
No. Observations:                  13   AIC:                             27.70
Df Residuals:                       7   BIC:                             31.09
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     69.8438      9.987      6.994      0.0



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
literacy,1.0,296.579278,296.579278,815.327082,1.661736e-08
labor,1.0,8.414644,8.414644,23.132725,0.001944516
poverty,1.0,0.940111,0.940111,2.584462,0.1519524
health,1.0,1.829951,1.829951,5.030724,0.05981897
gdp,1.0,0.769732,0.769732,2.116072,0.1890863
Residual,7.0,2.546285,0.363755,,


Since literacy and labor are both important in determining mortality, I decided to create a model that interacts them. Now, R^2 is a tiny bit higher BUT coefficient confidence intervals are really bad. It is not possible to determine whether they are on the positive side or negative side. Also, looking at  the anova table it seems that the literacy and labor interaction is not that important.

This is not an ideal model

In [48]:
model = ols('mortality ~ literacy * labor + poverty + health + gdp', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.993
Model:                            OLS   Adj. R-squared:                  0.985
Method:                 Least Squares   F-statistic:                     136.1
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.84e-06
Time:                        13:45:55   Log-Likelihood:                -7.0988
No. Observations:                  13   AIC:                             28.20
Df Residuals:                       6   BIC:                             32.15
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept        -39.7571    128.314     -0.



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
literacy,1.0,296.579278,296.579278,784.369686,1.37105e-07
labor,1.0,8.414644,8.414644,22.254391,0.00326659
literacy:labor,1.0,0.741459,0.741459,1.960954,0.2109445
poverty,1.0,0.977511,0.977511,2.585246,0.1589881
health,1.0,1.057675,1.057675,2.797255,0.1454551
gdp,1.0,1.040763,1.040763,2.752529,0.1481718
Residual,6.0,2.26867,0.378112,,


By creating an interaction between health and labor, we obtain an amazing R^2 score AND suddenly health's importance based on the anova table is really high. However, the coefficients are not ideal.

In [51]:
model = ols('mortality ~ health * labor + poverty + literacy + gdp', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                     274.8
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           4.74e-07
Time:                        13:51:03   Log-Likelihood:                -2.5553
No. Observations:                  13   AIC:                             19.11
Df Residuals:                       6   BIC:                             23.07
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      123.8915     20.942      5.916   



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
health,1.0,9.368715,9.368715,49.845732,0.0004042581
labor,1.0,297.31357,297.31357,1581.840396,1.688494e-08
health:labor,1.0,2.684303,2.684303,14.281684,0.009189925
poverty,1.0,0.457567,0.457567,2.434458,0.1697143
literacy,1.0,0.124926,0.124926,0.664663,0.4460658
gdp,1.0,0.003194,0.003194,0.016993,0.9005434
Residual,6.0,1.127725,0.187954,,


Health-literacy interaction model's R^2 value is even better. Though the coefficients are still not ideal.

In [53]:
model = ols('mortality ~ health * literacy + poverty + labor + gdp', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                     303.4
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           3.53e-07
Time:                        13:51:30   Log-Likelihood:                -1.9152
No. Observations:                  13   AIC:                             17.83
Df Residuals:                       6   BIC:                             21.79
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         153.8300     28.894     



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
health,1.0,9.368715,9.368715,55.004799,0.0003089719
literacy,1.0,288.470118,288.470118,1693.641051,1.3766e-08
health:literacy,1.0,9.974526,9.974526,58.561584,0.0002600939
poverty,1.0,1.529089,1.529089,8.977454,0.02412501
labor,1.0,0.713595,0.713595,4.189598,0.08661886
gdp,1.0,0.002005,0.002005,0.011771,0.9171424
Residual,6.0,1.021952,0.170325,,


Even though the R^2 value is very good, this is not a good model. There are 3 interactions and the coefficients are unclear.

In [56]:
model = ols('mortality ~ health * literacy * labor + gdp + poverty', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     149.8
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           0.000808
Time:                        13:53:42   Log-Likelihood:                0.63046
No. Observations:                  13   AIC:                             18.74
Df Residuals:                       3   BIC:                             24.39
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept              1382.08



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
health,1.0,9.368715,9.368715,40.687021,0.007801
literacy,1.0,288.470118,288.470118,1252.785395,5e-05
health:literacy,1.0,9.974526,9.974526,43.317973,0.007137
labor,1.0,1.777539,1.777539,7.719604,0.069082
health:labor,1.0,0.151793,0.151793,0.659217,0.476277
literacy:labor,1.0,0.082741,0.082741,0.359331,0.591128
health:literacy:labor,1.0,0.449412,0.449412,1.951732,0.256804
gdp,1.0,0.111639,0.111639,0.484831,0.536341
poverty,1.0,0.002729,0.002729,0.01185,0.920188
Residual,3.0,0.690789,0.230263,,


In [59]:
model = ols('mortality ~ health + literacy + labor + gdp * poverty', data=df).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:              mortality   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.984
Method:                 Least Squares   F-statistic:                     128.0
Date:                Mon, 07 Nov 2022   Prob (F-statistic):           4.61e-06
Time:                        13:57:27   Log-Likelihood:                -7.4976
No. Observations:                  13   AIC:                             29.00
Df Residuals:                       6   BIC:                             32.95
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      56.0483     26.095      2.148      



Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
health,1.0,9.368715,9.368715,23.303087,0.002918195
literacy,1.0,288.470118,288.470118,717.520383,1.78775e-07
labor,1.0,9.764674,9.764674,24.287968,0.00263476
gdp,1.0,0.262535,0.262535,0.653011,0.4498815
poverty,1.0,0.667673,0.667673,1.660723,0.2449616
gdp:poverty,1.0,0.13406,0.13406,0.333451,0.584636
Residual,6.0,2.412225,0.402038,,


### Overall,
Analyzing all these linear regression models so far, the most ideal one would be the literacy-health interaction one, with an R^2 value of 0.997 and 4/7 of the coefficients' confidence intervals are clear.

# Question 2

For this question use Yahoo's Finance API for the following tickers:

* Gold futures (GC=F)
* Silver futures (SI=F)
* Copper futures (HG=F)
* Platinum futures (PL=F)

1. Write the best linear regression model that explains gold futures closing prices in terms of opening prices of gold, silver, copper, and platinum futures.
2. Repeat the same for silver, copper and platinum prices.
3. Compare the models you obtained in Steps 1 and 2. Which model is better? How do you decide? Explain.

In [2]:
gold = yf.download('GC=F')
silver = yf.download('SI=F')
copper = yf.download('HG=F')
platinum = yf.download('PL=F')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


# Q2.1
I declare an empty dictionary tmp that contains each data in stated form such as closing prices or opening prices. to explain gold futures closing prices using linear regression, I extract gold's closing and opening prices; silver, copper, platinum's opening prices into the dictionary. Then turn them to a data frame with the data as index.

In [7]:
tmp = {}
tmp['gold'] = gold['Close']
tmp['gold2'] = gold['Open']
tmp['silver'] = silver['Open']
tmp['copper'] = copper['Open']
tmp['platinum'] = platinum['Open']
data1 = pd.DataFrame(tmp).dropna()
data1

Unnamed: 0_level_0,gold,gold2,silver,copper,platinum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30 00:00:00-04:00,273.899994,273.899994,4.950000,0.8790,593.900024
2000-08-31 00:00:00-04:00,278.299988,274.799988,4.920000,0.8850,589.000000
2000-09-01 00:00:00-04:00,277.000000,277.000000,5.035000,0.8780,588.000000
2000-09-05 00:00:00-04:00,275.799988,275.799988,4.990000,0.8960,602.000000
2000-09-06 00:00:00-04:00,274.200012,274.200012,5.000000,0.9050,603.000000
...,...,...,...,...,...
2022-10-31 00:00:00-04:00,1635.900024,1641.800049,19.200001,3.4095,939.400024
2022-11-01 00:00:00-04:00,1645.000000,1630.800049,19.125000,3.4945,959.799988
2022-11-02 00:00:00-04:00,1645.699951,1650.800049,19.780001,3.4985,960.200012
2022-11-03 00:00:00-04:00,1627.300049,1629.199951,19.235001,3.4455,933.400024


The first model I tried for gold future's closing prices, I obtained a perfect R^2. But checking the confidence intervals for the coefficients, only gold's opening price coefficient is clear(positive). Also, judging by the anova table, gold's opening prices play the biggest part in gold's closing prices in this model.

So, not a good model

In [8]:
model = ols('gold ~ gold2 + silver + copper + platinum', data=data1).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                   gold   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.513e+06
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        17:57:03   Log-Likelihood:                -18728.
No. Observations:                4864   AIC:                         3.747e+04
Df Residuals:                    4859   BIC:                         3.750e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.4630      0.660      0.701      0.4

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold2,1.0,1301874000.0,1301874000.0,10053740.0,0.0
silver,1.0,0.1313139,0.1313139,0.001014073,0.974597
copper,1.0,295.3442,295.3442,2.280798,0.131049
platinum,1.0,96.25318,96.25318,0.743316,0.388643
Residual,4859.0,629199.7,129.4916,,


The model I obtained by interacting gold and silver opening prices, is slightly better. Gold's opening prices is still the most influential.

In [16]:
model = ols('gold ~ gold2 * silver + copper + platinum', data=data1).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                   gold   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 2.012e+06
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:00:24   Log-Likelihood:                -18725.
No. Observations:                4864   AIC:                         3.746e+04
Df Residuals:                    4858   BIC:                         3.750e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.2254      0.730     -0.309   

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold2,1.0,1301874000.0,1301874000.0,10061720.0,0.0
silver,1.0,0.1313139,0.1313139,0.001014878,0.974587
gold2:silver,1.0,444.6661,444.6661,3.436664,0.063825
copper,1.0,69.90486,69.90486,0.5402694,0.462357
platinum,1.0,505.4809,505.4809,3.906679,0.048151
Residual,4858.0,628571.2,129.3889,,


# Q2.2
Next, let's explain silver's closing prices in terms of others' opening prices

In [19]:
tmp = {}
tmp['gold'] = gold['Open']
tmp['silver'] = silver['Close']
tmp['silver2'] = silver['Open']
tmp['copper'] = copper['Open']
tmp['platinum'] = platinum['Open']
data2 = pd.DataFrame(tmp).dropna()
data2

Unnamed: 0_level_0,gold,silver,silver2,copper,platinum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30 00:00:00-04:00,273.899994,4.930000,4.950000,0.8790,593.900024
2000-08-31 00:00:00-04:00,274.799988,5.003000,4.920000,0.8850,589.000000
2000-09-01 00:00:00-04:00,277.000000,5.004000,5.035000,0.8780,588.000000
2000-09-05 00:00:00-04:00,275.799988,4.998000,4.990000,0.8960,602.000000
2000-09-06 00:00:00-04:00,274.200012,4.983000,5.000000,0.9050,603.000000
...,...,...,...,...,...
2022-10-31 00:00:00-04:00,1641.800049,19.125000,19.200001,3.4095,939.400024
2022-11-01 00:00:00-04:00,1630.800049,19.673000,19.125000,3.4945,959.799988
2022-11-02 00:00:00-04:00,1650.800049,19.600000,19.780001,3.4985,960.200012
2022-11-03 00:00:00-04:00,1629.199951,19.436001,19.235001,3.4455,933.400024


The model below, has a good R^2 value but whether the coefficients are negative or positive is unclear. Looking at the anova table we can see that gold's opening prices are very influential.

In [20]:
model = ols('silver ~ gold + silver2 + copper + platinum', data=data2).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                 silver   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 9.070e+05
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:05:18   Log-Likelihood:                -1307.7
No. Observations:                4864   AIC:                             2625.
Df Residuals:                    4859   BIC:                             2658.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0357      0.018     -1.942      0.0

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold,1.0,275713.597128,275713.597128,2747696.0,0.0
silver2,1.0,88328.551561,88328.551561,880261.4,0.0
copper,1.0,0.524118,0.524118,5.223231,0.02233
platinum,1.0,0.805024,0.805024,8.022674,0.004639
Residual,4859.0,487.569276,0.100344,,


The model I created by interacting copper and silver opening prices is better coefficient analysis wise.

In [25]:
model = ols('silver ~ silver2 * copper + gold + platinum', data=data2).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                 silver   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 7.265e+05
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:09:50   Log-Likelihood:                -1304.3
No. Observations:                4864   AIC:                             2621.
Df Residuals:                    4858   BIC:                             2659.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -0.0094      0.021     -0.

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
silver2,1.0,364042.053741,364042.053741,3632274.0,0.0
copper,1.0,0.617011,0.617011,6.156298,0.013128
silver2:copper,1.0,0.010612,0.010612,0.1058861,0.744891
gold,1.0,0.012597,0.012597,0.1256904,0.722958
platinum,1.0,1.463557,1.463557,14.60282,0.000134
Residual,4858.0,486.889588,0.100224,,


Next, let's explain copper's closing prices in terms of others' opening prices

In [26]:
tmp = {}
tmp['gold'] = gold['Open']
tmp['silver'] = silver['Open']
tmp['copper'] = copper['Close']
tmp['copper2'] = copper['Open']
tmp['platinum'] = platinum['Open']
data3 = pd.DataFrame(tmp).dropna()
data3

Unnamed: 0_level_0,gold,silver,copper,copper2,platinum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30 00:00:00-04:00,273.899994,4.950000,0.8850,0.8790,593.900024
2000-08-31 00:00:00-04:00,274.799988,4.920000,0.8850,0.8850,589.000000
2000-09-01 00:00:00-04:00,277.000000,5.035000,0.8890,0.8780,588.000000
2000-09-05 00:00:00-04:00,275.799988,4.990000,0.9060,0.8960,602.000000
2000-09-06 00:00:00-04:00,274.200012,5.000000,0.9015,0.9050,603.000000
...,...,...,...,...,...
2022-10-31 00:00:00-04:00,1641.800049,19.200001,3.4135,3.4095,939.400024
2022-11-01 00:00:00-04:00,1630.800049,19.125000,3.5095,3.4945,959.799988
2022-11-02 00:00:00-04:00,1650.800049,19.780001,3.5055,3.4985,960.200012
2022-11-03 00:00:00-04:00,1629.199951,19.235001,3.4565,3.4455,933.400024


Good R^2 value, not very good confidence intervals.

In [27]:
model = ols('copper ~ gold + silver + copper2 + platinum', data=data3).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                 copper   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.032e+06
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:12:15   Log-Likelihood:                 8985.2
No. Observations:                4864   AIC:                        -1.796e+04
Df Residuals:                    4859   BIC:                        -1.793e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0021      0.002     -0.935      0.3

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold,1.0,4140.041915,4140.041915,2841664.0,0.0
silver,1.0,432.305043,432.305043,296727.9,0.0
copper2,1.0,1440.124574,1440.124574,988480.4,0.0
platinum,1.0,0.009214,0.009214,6.32438,0.011941
Residual,4859.0,7.079114,0.001457,,


Now, along with R^2, confidence intervals are not bad either. We can somewhat see clearly whether they are negative or positive

In [31]:
model = ols('copper ~ gold * platinum + silver + copper2', data=data3).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:                 copper   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 8.260e+05
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:24:18   Log-Likelihood:                 8987.5
No. Observations:                4864   AIC:                        -1.796e+04
Df Residuals:                    4858   BIC:                        -1.792e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.0111      0.005     -2.331

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold,1.0,4140.041915,4140.041915,2843763.0,0.0
platinum,1.0,865.855165,865.855165,594749.2,0.0
gold:platinum,1.0,140.731773,140.731773,96667.56,0.0
silver,1.0,87.536533,87.536533,60128.17,0.0
copper2,1.0,778.32204,778.32204,534623.4,0.0
Residual,4858.0,7.072434,0.001456,,


Next, let's explain platinum's closing prices in terms of others' opening prices

In [32]:
tmp = {}
tmp['gold'] = gold['Open']
tmp['silver'] = silver['Open']
tmp['copper'] = copper['Open']
tmp['platinum'] = platinum['Close']
tmp['platinum2'] = platinum['Open']
data4 = pd.DataFrame(tmp).dropna()
data4

Unnamed: 0_level_0,gold,silver,copper,platinum,platinum2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-30 00:00:00-04:00,273.899994,4.950000,0.8790,591.400024,593.900024
2000-08-31 00:00:00-04:00,274.799988,4.920000,0.8850,586.700012,589.000000
2000-09-01 00:00:00-04:00,277.000000,5.035000,0.8780,595.299988,588.000000
2000-09-05 00:00:00-04:00,275.799988,4.990000,0.8960,601.299988,602.000000
2000-09-06 00:00:00-04:00,274.200012,5.000000,0.9050,611.099976,603.000000
...,...,...,...,...,...
2022-10-31 00:00:00-04:00,1641.800049,19.200001,3.4095,939.400024,939.400024
2022-11-01 00:00:00-04:00,1630.800049,19.125000,3.4945,959.799988,959.799988
2022-11-02 00:00:00-04:00,1650.800049,19.780001,3.4985,960.200012,960.200012
2022-11-03 00:00:00-04:00,1629.199951,19.235001,3.4455,933.400024,933.400024


This seems like a bad model even though R^2 is good, confidence intervals are all over the place...

In [43]:
model = ols('platinum ~ gold + silver + copper + platinum2', data=data4).fit()
print(model.summary())
sm.stats.anova_lm(model)

0,1,2,3
Dep. Variable:,platinum,R-squared:,0.999
Model:,OLS,Adj. R-squared:,0.999
Method:,Least Squares,F-statistic:,2078000.0
Date:,"Mon, 31 Oct 2022",Prob (F-statistic):,0.0
Time:,12:41:32,Log-Likelihood:,-17493.0
No. Observations:,4860,AIC:,35000.0
Df Residuals:,4855,BIC:,35030.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8717,0.514,1.697,0.090,-0.135,1.879
gold,-0.0013,0.001,-1.915,0.056,-0.003,3.23e-05
silver,0.0646,0.046,1.405,0.160,-0.026,0.155
copper,0.1796,0.280,0.642,0.521,-0.369,0.728
platinum2,0.9994,0.001,1349.092,0.000,0.998,1.001

0,1,2,3
Omnibus:,2694.682,Durbin-Watson:,1.92
Prob(Omnibus):,0.0,Jarque-Bera (JB):,434128.853
Skew:,-1.597,Prob(JB):,0.0
Kurtosis:,49.191,Cond. No.,6720.0


The 2 models below, with interactions are pretty good.

In [35]:
model = ols('platinum ~ gold * copper + silver + platinum2', data=data4).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:               platinum   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.666e+06
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:28:16   Log-Likelihood:                -17502.
No. Observations:                4864   AIC:                         3.502e+04
Df Residuals:                    4858   BIC:                         3.506e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.2881      0.573      0.503      

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold,1.0,163697100.0,163697100.0,2091862.0,0.0
copper,1.0,225510000.0,225510000.0,2881761.0,0.0
gold:copper,1.0,14537650.0,14537650.0,185774.7,0.0
silver,1.0,148717600.0,148717600.0,1900442.0,0.0
platinum2,1.0,99463770.0,99463770.0,1271034.0,0.0
Residual,4858.0,380159.0,78.25422,,


In [36]:
model = ols('platinum ~ gold * platinum2 + silver + copper', data=data4).fit()
print(model.summary())
sm.stats.anova_lm(model)

                            OLS Regression Results                            
Dep. Variable:               platinum   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.669e+06
Date:                Mon, 07 Nov 2022   Prob (F-statistic):               0.00
Time:                        18:28:44   Log-Likelihood:                -17499.
No. Observations:                4864   AIC:                         3.501e+04
Df Residuals:                    4858   BIC:                         3.505e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -2.5797      1.107     -2.

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
gold,1.0,163697100.0,163697100.0,2094928.0,0.0
platinum2,1.0,488228400.0,488228400.0,6248148.0,0.0
gold:platinum2,1.0,131.5451,131.5451,1.683461,0.194527
silver,1.0,919.8612,919.8612,11.77201,0.000606
copper,1.0,89.65147,89.65147,1.147323,0.284163
Residual,4858.0,379602.7,78.1397,,


# Q2.3
I noted my comparisons as I go and try different models above. These models, mostly have a perfect R^2 value but the coefficients' confidence intervals are not ideal. When choosing a good model, these are the first 2 attributes I check. So my goal when creating models for these data sets was to find at least 4 coefficients which I could determine if they are negative or positive. If only 3 of them were clear, I called that model bad and tried others.

It is also interesting to note that most of the time gold's opening price is the most influential when trying to explain another's closing price.

# Question 3

1. Write a function that takes a ticker symbol and returns a pandas dataframe that for each day puts a 1 when the closing price is higher than the opening price, a 0 when the closing price is lower than the opening price.
2. Write the best logistic regression that predicts the time series you obtain from Step 1 for gold futures against the opening prices of gold, silver, copper, and platinum prices.
3. Repeat the same for silver, copper, and platinum prices.
4. Compare the models you obtained from Steps 2 and 3. Decide which is the best model, and explain your reasoning.
5. Does any of the models provide a good fit? Explain.

# Q3.1
y_finance function takes the ticker as input and after fetching data from yahoo's finance, it returns the wanted data frame along with a new column, Comparison. Comparison is 0 if the closing price is lower than the opening, 1 if it is higher and leaves it as null if neither.

In [6]:
def y_finance(ticker):
    df = yf.download(ticker);
    df['Comparison'] = np.nan
    for i in range(len(df)):
        if df['Close'][i] > df['Open'][i]:
            df['Comparison'][i]=1
        elif df['Close'][i] < df['Open'][i]:
            df['Comparison'][i]=0
    return df

The log_reg_predict function predicts y against X and takes train size and iteration number from the user.

In [14]:
def log_reg_predict(X,y,tsize,it):
    X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=tsize)

    model = LogisticRegression(max_iter=it)
    model.fit(X_train,y_train)

    print(model.score(X_test,y_test))

    y_predict = model.predict(X_test)
    print(confusion_matrix(y_test,y_predict))

The bootstrap function also does prediction BUT the difference is instead of me running the same code several times, bootstrap does it and keeps the model scores. In the end I can see the model score range limited by the minimum and maximum

> The reference for the bootstrap function is *Lecture 6 of MAT388E*

In [7]:
def bootstrap(X,y,tsize,it):
    res = []
    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=tsize)
        model = LogisticRegression(max_iter=it)
        model.fit(X_train,y_train)
        res.append(model.score(X_test,y_test))
    tmp = sorted(res)[3:97] 
    return (min(tmp),max(tmp))

# Q3.2 
Trying to find a good logistic regression model that predicts the time series for gold futures against the opening prices of gold, silver, copper, and platinum prices. The null values from earlier get dropped here. The length was 5568 and now it is 4037

In [12]:
tmp = {}
tmp['gold'] = y_finance('GC=F')['Open']
tmp['gold2'] = y_finance('GC=F')['Comparison']
tmp['silver'] = y_finance('SI=F')['Open']
tmp['copper'] = y_finance('HG=F')['Open']
tmp['platinum'] = y_finance('PL=F')['Open']
gdata = pd.DataFrame(tmp).dropna()
gdata

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


Unnamed: 0_level_0,gold,gold2,silver,copper,platinum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-31 00:00:00-04:00,274.799988,1.0,4.920000,0.8850,589.000000
2000-09-28 00:00:00-04:00,277.500000,0.0,4.953000,0.9140,575.000000
2000-09-29 00:00:00-04:00,274.600006,0.0,4.870000,0.9180,578.000000
2000-10-02 00:00:00-04:00,272.799988,1.0,4.906000,0.9140,575.000000
2000-10-03 00:00:00-04:00,272.100006,0.0,4.913000,0.8900,581.000000
...,...,...,...,...,...
2022-11-01 00:00:00-04:00,1630.800049,1.0,19.125000,3.4945,959.799988
2022-11-02 00:00:00-04:00,1650.800049,0.0,19.780001,3.4985,960.200012
2022-11-03 00:00:00-04:00,1629.199951,0.0,19.235001,3.4455,933.400024
2022-11-04 00:00:00-04:00,1630.199951,1.0,19.980000,3.6370,969.799988


In [18]:
X = gdata[['gold', 'silver', 'copper', 'platinum']]
y = gdata['gold2']

The model below, is not good. As I run the code multiple times I get a bad model score and a bad confusion matrix. Label 0 predictions is terrible and label 1 predictions is slightly better than label 0, but still very bad.

The boostrap function tells me that for these values for iteration number and train size, the model score range is bad.

In [24]:
log_reg_predict(X,y,0.75,1500)
bootstrap(X,y,0.75,1500)

0.48514851485148514
[[136 362]
 [158 354]]


(0.48118811881188117, 0.5277227722772277)

I try different models, but the results is still not good. 

In [25]:
log_reg_predict(X,y,0.5,1500)
bootstrap(X,y,0.5,1500)

0.49826646854878653
[[106 889]
 [124 900]]


(0.4814264487369985, 0.5200594353640416)

In [27]:
log_reg_predict(X,y,0.5,5500)
bootstrap(X,y,0.5,5500)

0.5111441307578009
[[389 589]
 [398 643]]


(0.48340762753838534, 0.5200594353640416)

# Q3.3
Silver:

In [28]:
tmp = {}
tmp['gold'] = y_finance('GC=F')['Open']
tmp['silver'] = y_finance('SI=F')['Open']
tmp['silver2'] = y_finance('SI=F')['Comparison']
tmp['copper'] = y_finance('HG=F')['Open']
tmp['platinum'] = y_finance('PL=F')['Open']
sdata = pd.DataFrame(tmp).dropna()

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


In [29]:
X = sdata[['gold', 'silver', 'copper', 'platinum']]
y = sdata['silver2']

In [30]:
log_reg_predict(X,y,0.75,1500)
bootstrap(X,y,0.75,1500)

0.5371702637889688
[[293 109]
 [277 155]]


(0.48081534772182255, 0.5467625899280576)

In [31]:
log_reg_predict(X,y,0.5,1500)
bootstrap(X,y,0.5,1500)

0.5038992201559688
[[382 472]
 [355 458]]


(0.48830233953209357, 0.5332933413317337)

In [32]:
log_reg_predict(X,y,0.5,5500)
bootstrap(X,y,0.5,5500)

0.4991001799640072
[[366 502]
 [333 466]]


(0.48830233953209357, 0.5290941811637673)

The first one gives the best range of scores so far (0.48, 0.55) BUT it is still not a good model.

Now, copper:

In [33]:
tmp = {}
tmp['gold'] = y_finance('GC=F')['Open']
tmp['silver'] = y_finance('SI=F')['Open']
tmp['copper'] = y_finance('HG=F')['Open']
tmp['copper2'] = y_finance('HG=F')['Comparison']
tmp['platinum'] = y_finance('PL=F')['Open']
cdata = pd.DataFrame(tmp).dropna()

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


In [34]:
X = cdata[['gold', 'silver', 'copper', 'platinum']]
y = cdata['copper2']

In [35]:
log_reg_predict(X,y,0.75,1500)
bootstrap(X,y,0.75,1500)

0.4969538729329852
[[ 39 535]
 [ 43 532]]


(0.4917319408181027, 0.5352480417754569)

In [36]:
log_reg_predict(X,y,0.5,1500)
bootstrap(X,y,0.5,1500)

0.5265448215839861
[[ 170  923]
 [ 165 1040]]


(0.49738903394255873, 0.5291557876414273)

In [37]:
log_reg_predict(X,y,0.5,5500)
bootstrap(X,y,0.5,5500)

0.5143603133159269
[[ 139  958]
 [ 158 1043]]


(0.49434290687554394, 0.525674499564839)

These models are terrible also.

Now platinum:

In [39]:
tmp = {}
tmp['gold'] = y_finance('GC=F')['Open']
tmp['silver'] = y_finance('SI=F')['Open']
tmp['copper'] = y_finance('HG=F')['Open']
tmp['platinum'] = y_finance('PL=F')['Open']
tmp['platinum2'] = y_finance('PL=F')['Comparison']
pdata = pd.DataFrame(tmp).dropna()

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Comparison'][i]=0


In [40]:
X = pdata[['gold', 'silver', 'copper', 'platinum']]
y = pdata['platinum2']

In [41]:
log_reg_predict(X,y,0.75,1500)
bootstrap(X,y,0.75,1500)

0.5341098169717138
[[ 14 264]
 [ 16 307]]


(0.502495840266223, 0.5557404326123128)

In [42]:
log_reg_predict(X,y,0.5,1500)
bootstrap(X,y,0.5,1500)

0.5295587010824313
[[ 73 481]
 [ 84 563]]


(0.5120732722731057, 0.5512073272273106)

In [43]:
log_reg_predict(X,y,0.5,5500)
bootstrap(X,y,0.5,5500)

0.5262281432139884
[[ 20 543]
 [ 26 612]]


(0.5087427144046628, 0.5437135720233139)

The second model is the best one I obtained so far with a model score range of (51,55). I would not call it a good model, but it is better than other.

# Q3.4
Comparing what I have created so far, the best model out of all, is the one predicting time series for **platinum** futures. My reasoning is that, the confusion matrix for all of them are not good and shows that the labelling process is not accurate. But with the help of the bootstrap function, instead of running a single section of prediction, I do that 100 times and get a range of model scores. The model for the platinum with the iteration number=1500 and train size=%50 gives me a range of (0.51, 0.55) which shows that this is not a good model, but comparing  it to others, it is slightly better.

# Q3.5

None of the models provide a good fit. In my observation using logistic regression for this data set may not be a good idea as the results have a very low model score and bad confusion matrixes.

# Question 4

For this question use the following [data](https://archive.ics.uci.edu/ml/datasets/credit+approval):


In [3]:
credit = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data', header=None)

fn = {'+': 1, '-': 0}

X = credit.replace('?',0).iloc[:,[1,2,7,10,14]]
y = credit.iloc[:,15].map(lambda x: fn.get(x,0))

1. Split the data into training and test set.
2. Write different logistic regression models predicting y against X.
3. Construct [confusion matrices](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) on the test data set for these different models.
4. Analyze these models. Explain which model is the best model you have found.
5. Repeat Steps 1-4 several times. Does your best model stay as the best model? What should be the correct protocol to decide on the best model explaining the data?

In [3]:
X

Unnamed: 0,1,2,7,10,14
0,30.83,0.000,1.25,1,0
1,58.67,4.460,3.04,6,560
2,24.50,0.500,1.50,0,824
3,27.83,1.540,3.75,5,3
4,20.17,5.625,1.71,0,0
...,...,...,...,...,...
685,21.08,10.085,1.25,0,0
686,22.67,0.750,2.00,2,394
687,25.25,13.500,2.00,1,1
688,17.92,0.205,0.04,0,750


In [4]:
y

0      1
1      1
2      1
3      1
4      1
      ..
685    0
686    0
687    0
688    0
689    0
Name: 15, Length: 690, dtype: int64

# Q4.1/Q4.2/Q4.3/Q4.4

Creating and trying my first model I set the training set size as %75 and the iterations 1500.

By running this code several times I see that the score is decent, around %75 which tells me this is not a bad model. 

Looking at the confusion matrix I constructed, first row that represents label 0 gives good results as the correctly predicted number is way higher than the incorrectly ones. Second row that represents label 1 is very bad. The correct and incorrect predictions are almost the same.

(While rows represent the real labels; columns represent the predicted labels)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.75)

model = LogisticRegression(max_iter=1500)
model.fit(X_train,y_train)

print(model.score(X_test,y_test))

y_predict = model.predict(X_test)
confusion_matrix(y_test,y_predict)

0.7687861271676301


array([[93,  5],
       [35, 40]], dtype=int64)

By changing the iteration number, it seems like not much has changed. Maybe I get higher scores than the other model as I run the code several times, but it is hard t be sure by checking manually.

But comparing two models, there is definitely a problem when labelling 1 in both cases.

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.75)

model = LogisticRegression(max_iter=5500)
model.fit(X_train,y_train)

print(model.score(X_test,y_test))

y_predict = model.predict(X_test)
confusion_matrix(y_test,y_predict)

0.7745664739884393


array([[83,  8],
       [31, 51]], dtype=int64)

**Side note:** the number of 0's in y is higher than the 1's. Maybe that is the reason the models label 0's better than they label 1's.

In [74]:
print("length of y is",len(y))
print("there are",(y==0).sum(),"0's")
print("there are",(y==1).sum(),"1's")

length of y is 690
there are 383 0's
there are 307 1's


Highering the iteration number, and making the training size %50, this model is slightly better. Out of all the ones I have tried so far, this one seems the best.

But it may not always be wise to make the iteration number as high as it can get in real life scenarios. There of course will be a cost and we will tire the machine.

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.5)

model = LogisticRegression(max_iter=6000)
model.fit(X_train,y_train)

print(model.score(X_test,y_test))

y_predict = model.predict(X_test)
confusion_matrix(y_test,y_predict)

0.782608695652174


array([[170,  26],
       [ 49, 100]], dtype=int64)

# Q4.5
Running a code snippet several times in order to decide if that model is good, is not a very smart approach. Using the bootstrap function we can set certain limits such as iteration number and train size, and obtain a score range of the model. But we can't check confusion matrixes.

In [93]:
def bootstrap(X,y):
    res = []
    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.75)
        model = LogisticRegression(max_iter=1500)
        model.fit(X_train,y_train)
        res.append(model.score(X_test,y_test))
    tmp = sorted(res)[3:97] # delete the %5, outliers.
    return (min(tmp),max(tmp))

In [94]:
bootstrap(X,y)

(0.7052023121387283, 0.8034682080924855)

In [95]:
def bootstrap2(X,y):
    res = []
    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.5)
        model = LogisticRegression(max_iter=1500)
        model.fit(X_train,y_train)
        res.append(model.score(X_test,y_test))
    tmp = sorted(res)[3:97] # delete the %5, outliers.
    return (min(tmp),max(tmp))

In [96]:
bootstrap2(X,y)

(0.7275362318840579, 0.8)

In [97]:
def bootstrap3(X,y):
    res = []
    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.5)
        model = LogisticRegression(max_iter=5500)
        model.fit(X_train,y_train)
        res.append(model.score(X_test,y_test))
    tmp = sorted(res)[3:97] # delete the %5, outliers.
    return (min(tmp),max(tmp))

In [99]:
bootstrap3(X,y)

(0.7188405797101449, 0.7913043478260869)

By declaring 3 versions of the bootstrap function I wanted to see how iteration and train size affects a model's score. Splitting train and test in half is slightly better than setting train as %75.

But it shocking to see that the lower number of iterations model gives a higher score. 

Now, I am not sure if my model stays as the best model. With the results of the bootstrap function the best model might be the one with the max_iter=1500 and train_size=0.75.