In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2s
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
inflation = pd.read_csv('Additional.csv', index_col="Period")
inflation.head()

Unnamed: 0_level_0,Yield Curve,Production Index,Housing Starts,Cost of Living Index,Unemployment,Business Confidence,Producer Price Index,Export Price Index,Import Price Index,Basic Price Index for Domestic Goods,USD,JPY,GBP,CNY,OMX Helsinki,Consumer Price Index
Period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Jan.00,0.62,96.01601,118.4,1466,10.1,13,91.1,110.2,85.9,79.1,1.0137,106.53,0.61834,8.3926,14364.0,98.0
Feb.00,0.53,96.31606,118.2,1476,10.2,20,92.9,113.3,87.2,79.9,0.9834,107.64,0.61466,8.1408,15864.0,98.8
Mar.00,0.45,95.51592,118.4,1485,10.2,17,93.5,114.4,87.9,80.3,0.9643,102.59,0.61063,7.9834,17092.0,99.3
Apr.00,0.22,97.01617,119.3,1490,10.0,15,93.9,115.1,87.4,80.2,0.947,99.92,0.59802,7.8402,15799.0,99.6
May.00,0.11,101.2169,120.5,1497,9.8,19,93.8,114.6,89.2,81.0,0.906,98.09,0.60151,7.4996,16344.0,99.9


In [3]:
inflation['OMX Helsinki'] = pd.to_numeric(inflation['OMX Helsinki']).astype(float)
inflation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277 entries, Jan.00 to Jan.23
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Yield Curve                           277 non-null    float64
 1   Production Index                      277 non-null    float64
 2   Housing Starts                        277 non-null    float64
 3   Cost of Living Index                  277 non-null    int64  
 4   Unemployment                          277 non-null    float64
 5   Business Confidence                   277 non-null    int64  
 6   Producer Price Index                  277 non-null    float64
 7   Export Price Index                    277 non-null    float64
 8   Import Price Index                    277 non-null    float64
 9   Basic Price Index for Domestic Goods  277 non-null    float64
 10  USD                                   277 non-null    float64
 11  JPY             

Unoptimized Decision Tree

In [5]:
X = inflation.drop('Consumer Price Index', axis=1)
y = inflation['Consumer Price Index']


#creating the x and y value for the dataset and creating a test and training set from them

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = MinMaxScaler()
scaler.fit (X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

inflation_tree = DecisionTreeRegressor(random_state=42)

inflation_tree.fit(X_train_scaled, y_train)

y_pred_decision = inflation_tree.predict(X_test_scaled)
mse_tree = mse(y_test, y_pred_decision)
mae_tree = mae(y_test, y_pred_decision)
rsq_tree = r2s(y_test, y_pred_decision)
print ('MSE of Decision Tree: ', mse_tree)
print ('MAE of Decision Tree', mae_tree)
print ('R-Squared of Decision Tree', rsq_tree)


MSE of Decision Tree:  0.3558571428571424
MAE of Decision Tree 0.3414285714285709
R-Squared of Decision Tree 0.9978083516445174


In [6]:
print('Depth of tree:', inflation_tree.tree_.max_depth)
print('Number of nodes in tree:', inflation_tree.tree_.node_count)

Depth of tree: 10
Number of nodes in tree: 299


In [7]:
param_dt = {'max_depth': [2, 4, 6, 8, 10], 'min_samples_leaf': [1, 2, 5, 10, 15, 20, 25]}
grid_dt = GridSearchCV(estimator=inflation_tree, param_grid=param_dt, scoring='neg_mean_squared_error')
grid_result = grid_dt.fit(X_train_scaled, y_train)
print ('Best Param:', grid_result.best_params_ )

Best Param: {'max_depth': 8, 'min_samples_leaf': 2}


Optimized for New Variables for Decision Tree

In [8]:
inflation_tree = DecisionTreeRegressor(random_state=42, max_depth=8, min_samples_leaf=1)

inflation_tree.fit(X_train_scaled, y_train)

y_pred_decision = inflation_tree.predict(X_test_scaled)
mse_tree = mse(y_test, y_pred_decision)
mae_tree = mae(y_test, y_pred_decision)
rsq_tree = r2s(y_test, y_pred_decision)
print ('MSE of Decision Tree: ', mse_tree)
print ('MAE of Decision Tree', mae_tree)
print ('R-Squared of Decision Tree', rsq_tree)


MSE of Decision Tree:  0.7190992063492055
MAE of Decision Tree 0.44899999999999846
R-Squared of Decision Tree 0.9955712211356209


Optimization from Initial Dataset for Decision Tree

In [9]:
inflation_tree = DecisionTreeRegressor(random_state=42, max_depth=10, min_samples_leaf=1)

inflation_tree.fit(X_train_scaled, y_train)

y_pred_decision = inflation_tree.predict(X_test_scaled)
mse_tree = mse(y_test, y_pred_decision)
mae_tree = mae(y_test, y_pred_decision)
rsq_tree = r2s(y_test, y_pred_decision)
print ('MSE of Decision Tree: ', mse_tree)
print ('MAE of Decision Tree', mae_tree)
print ('R-Squared of Decision Tree', rsq_tree)


MSE of Decision Tree:  0.3558571428571424
MAE of Decision Tree 0.3414285714285709
R-Squared of Decision Tree 0.9978083516445174


Unoptimized Random Forest

In [None]:

inflation_random = RandomForestRegressor(n_estimators=200, random_state=0)
inflation_random.fit(X_train_scaled, y_train)
y_pred_rand = inflation_random.predict(X_test_scaled)
mse_rand = mse(y_test, y_pred_rand)
mae_rand = mae(y_test, y_pred_rand)
r2 = r2s(y_test, y_pred_rand)
print ('MSE of Random Forest:',mse_rand )
print ('MAE of Random Forest:',mae_rand )
print ('R-squared', r2)

MSE of Random Forest: 0.20444984285718612
MAE of Random Forest: 0.26469999999999955
R-squared 0.9987408369598008


In [None]:
grid = dict(max_depth = [1, 5, 10, 15], min_samples_leaf = [1, 5, 10, 15, 20])

forest_est = RandomForestRegressor(n_estimators = 200, random_state = 42)

grid_search = GridSearchCV(estimator=inflation_random, param_grid=grid, scoring='neg_mean_squared_error')

grid_result = grid_search.fit(X, y)

# Print out the best result
print("Best result is obtained using", grid_result.best_params_)

Best result is obtained using {'max_depth': 15, 'min_samples_leaf': 1}


Optimized Random Forest for New Dataset

In [None]:
inflation_random = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=10, min_samples_leaf=1)
inflation_random.fit(X_train, y_train)
y_pred_rand = inflation_random.predict(X_test)
mse_rand = mse(y_test, y_pred_rand)
mae_rand = mae(y_test, y_pred_rand)
r2 = r2s(y_test, y_pred_rand)
print ('MSE of Random Forest:',mse_rand )
print ('MAE of Random Forest:',mae_rand )
print ('R-squared', r2)

MSE of Random Forest: 0.20119588200003288
MAE of Random Forest: 0.25219571428572163
R-squared 0.9987608774117195


Optimized Random Forest from Initial Dataset

In [23]:
inflation_random = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=15, min_samples_leaf=1)
inflation_random.fit(X_train, y_train)
y_pred_rand = inflation_random.predict(X_test)
mse_rand = mse(y_test, y_pred_rand)
mae_rand = mae(y_test, y_pred_rand)
r2 = r2s(y_test, y_pred_rand)
print ('MSE of Random Forest:',mse_rand )
print ('MAE of Random Forest:',mae_rand )
print ('R-squared', r2)

MSE of Random Forest: 0.19576792142859478
MAE of Random Forest: 0.26337142857144347
R-squared 0.9987943070648789


Unoptimized LASSO

In [30]:
from sklearn.linear_model import Lasso

inflation_lasso = Lasso(alpha=0.1, random_state=42)
inflation_lasso.fit(X_train_scaled, y_train)
y_pred_lasso = inflation_lasso.predict(X_test_scaled)
mse_lasso = mse(y_test, y_pred_lasso)
mae_lasso = mae(y_test, y_pred_lasso)
r2_lasso = r2s(y_test, y_pred_lasso)
print ('MSE of LASSO:',mse_lasso )
print ('MAE of LASSO Forest:',mae_lasso )
print ('R-Squared of Lasso', r2_lasso)


MSE of LASSO: 0.25190844904734055
MAE of LASSO Forest: 0.43970521711984534
R-Squared of Lasso 0.9984485495115988


Optimized LASSO for Original Dataset

In [40]:
#After manual tuning
from sklearn.linear_model import Lasso

inflation_lasso = Lasso(alpha=0.001, random_state=42)
inflation_lasso.fit(X_train_scaled, y_train)
y_pred_lasso = inflation_lasso.predict(X_test_scaled)
mse_lasso = mse(y_test, y_pred_lasso)
mae_lasso = mae(y_test, y_pred_lasso)
r2_lasso = r2s(y_test, y_pred_lasso)
print ('MSE of LASSO:',mse_lasso )
print ('MAE of LASSO Forest:',mae_lasso )
print ('R-Squared of Lasso', r2_lasso)


MSE of LASSO: 0.007651088626041924
MAE of LASSO Forest: 0.042575130452416336
R-Squared of Lasso 0.9999528785746149


Unoptimized SVR

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Extract the X and Y columns
X = inflation.drop('Consumer Price Index', axis=1)
y = inflation['Consumer Price Index']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale the X data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVR model with chosen hyperparameters
svr = SVR(kernel='linear', C=1, epsilon=0.01)

# Train the SVR model on the training data
svr.fit(X_train_scaled, y_train)

# Evaluate the performance of the model on the testing data
y_pred = svr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print('MAE:', mae)
print('R-squared:', r2)


MSE: 2.479701712622173
MAE: 1.2756012057733819
R-squared: 0.9836990984636363


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the dataset
data = inflation

# Extract the X and Y columns
X = inflation.drop('Consumer Price Index', axis=1)
y = inflation['Consumer Price Index']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale the X data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a dictionary of hyperparameters and their possible values
param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 2, 3, 4, 5, 10], 'epsilon': [0.01, 0.1, 1]}

# Create an SVR model
svr = SVR()

# Use GridSearchCV to find the best combination of hyperparameters
grid_search = GridSearchCV(svr, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print (best_params)
print (best_model)


{'C': 10, 'epsilon': 0.01, 'kernel': 'linear'}
SVR(C=10, epsilon=0.01, kernel='linear')


Optimized for New Dataset

In [None]:
svr = SVR (C=50, epsilon=0.01, kernel='linear')

# Train the SVR model on the training data
svr.fit(X_train_scaled, y_train)

# Evaluate the performance of the model on the testing data
y_pred = svr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print('MAE:', mae)
print('R-squared:', r2)



MSE: 0.009146208037214203
MAE: 0.05205595167807617
R-squared: 0.9999398752535893


Optimized SVR for Original Dataset

In [None]:
svr = SVR (C=20, epsilon=0.0341, kernel='linear')

# Train the SVR model on the training data
svr.fit(X_train_scaled, y_train)

# Evaluate the performance of the model on the testing data
y_pred = svr.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MSE:', mse)
print('MAE:', mae)
print('R-squared:', r2)



MSE: 0.013310750355821308
MAE: 0.0768395063064267
R-squared: 0.9999124986566647


In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2s
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [None]:

data = pd.read_csv('Additional.csv', index_col='Period')
data.head()

NameError: name 'pd' is not defined

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 277 entries, Jan.00 to Jan.23
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Yield Curve                           277 non-null    float64
 1   Production Index                      277 non-null    float64
 2   Housing Starts                        277 non-null    float64
 3   Cost of Living Index                  277 non-null    int64  
 4   Unemployment                          277 non-null    float64
 5   Business Confidence                   277 non-null    int64  
 6   Producer Price Index                  277 non-null    float64
 7   Export Price Index                    277 non-null    float64
 8   Import Price Index                    277 non-null    float64
 9   Basic Price Index for Domestic Goods  277 non-null    float64
 10  USD                                   277 non-null    float64
 11  JPY             

In [None]:
X = data.drop('Consumer Price Index', axis=1)
y = data['Consumer Price Index']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = MinMaxScaler()
scaler.fit (X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [None]:
param_grid = {'learning_rate': [0.005, 0.0005, 0.05, 0.1, 0.15],
              'max_depth': [1, 2, 3, 4, 5],
              'min_child_weight': [1, 2, 3, 4, 5]}


Unoptimized XG Boost

In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000) 
xgb_model.fit(X_train_scaled, y_train)
y_pred_xg = xgb_model.predict(X_test_scaled)
mse_xg = mse(y_test, y_pred_xg )
mae_xg = mae(y_test, y_pred_xg )
rsq_xg = r2s(y_test, y_pred_xg )
print('MSE: ', mse_xg)
print ('MAE: ', mae_xg)
print ('R-squared: ', rsq_xg)

MSE:  0.415730045874669
MAE:  0.4601597813197537
R-squared:  0.9974396071860452


In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)


In [None]:
grid_search.fit(X_train_scaled, y_train)


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None,
       

In [None]:
print('Best hyperparameters:', grid_search.best_params_)
print('Best mean squared error:', -grid_search.best_score_)


Best hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1}
Best mean squared error: 0.12879385858217382


Optimized XG Boost for New Dataset

In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.05, max_depth=3, min_child_weight=1)
xgb_model.fit(X_train_scaled, y_train)

NameError: name 'xgb' is not defined

In [None]:
y_pred_xg = xgb_model.predict(X_test_scaled)
mse_xg = mse(y_test, y_pred_xg )
mae_xg = mae(y_test, y_pred_xg )
rsq_xg = r2s(y_test, y_pred_xg )
print('MSE: ', mse_xg)
print ('MAE: ', mae_xg)
print ('R-squared: ', rsq_xg)

MSE:  0.09012867526014887
MAE:  0.1977616446358819
R-squared:  0.999444916683898


Optimized SVR for Original Dataset

In [None]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=4000, learning_rate=0.005, max_depth=20, min_child_weight=1)
xgb_model.fit(X_train_scaled, y_train)
y_pred_xg = xgb_model.predict(X_test_scaled)
mse_xg = mse(y_test, y_pred_xg )
mae_xg = mae(y_test, y_pred_xg )
rsq_xg = r2s(y_test, y_pred_xg )
print('MSE: ', mse_xg)
print ('MAE: ', mae_xg)
print ('R-squared: ', rsq_xg)

MSE:  0.07003671016391108
MAE:  0.1549116298130581
R-squared:  0.9995686588179129


Unoptimized MLP

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2s




# Train the artificial neural network
model = MLPRegressor(hidden_layer_sizes=(50, 50), activation='relu',max_iter=500, alpha=0.01, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict the Consumer Price Index using the trained model
y_pred_mlp = model.predict(X_test_scaled)

# Calculate the Mean Squared Error
mse_mlp = mse(y_test, y_pred_mlp)
mae_mlp = mae(y_test, y_pred_mlp)
rsq_mlp = r2s(y_test, y_pred_mlp)

# Print the Mean Squared Error
print('Mean Squared Error: ', mse_mlp)
print('Mean Absolute Error: ', mae_mlp)
print('R-squared: ', rsq_mlp)



Mean Squared Error:  73.77729708704925
Mean Absolute Error:  6.7491086082453435
R-squared:  0.545621339691102


#Optimization Performed on Google Colab due to resource constraint

mlp = MLPRegressor()
param_grid = {
'hidden_layer_sizes': [(10,), (50,), (100,), (10,10), (50,50), (100,100)],
'activation': ['logistic', 'tanh', 'relu'],
'solver': ['lbfgs', 'adam'],
'learning_rate': ['constant', 'adaptive'],
'max_iter': [1000, 2000, 3000, 4000, 5000]
}
grid_mlp = GridSearchCV(mlp, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_mlp.fit(X_train_scaled, y_train)

print("Best hyperparameters: ", grid_search.best_params_)

Optimized MLP for New Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2s


# Train the artificial neural network
model = MLPRegressor(hidden_layer_sizes=(100, 100), activation='relu', solver='lbfgs', max_iter=5000, learning_rate='adaptive', alpha=0.01, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict the Consumer Price Index using the trained model
y_pred_mlp = model.predict(X_test_scaled)

# Calculate the Mean Squared Error
mse_mlp = mse(y_test, y_pred_mlp)
mae_mlp = mae(y_test, y_pred_mlp)
rsq_mlp = r2s(y_test, y_pred_mlp)

# Print the Mean Squared Error
print('Mean Squared Error: ', mse_mlp)
print('Mean Absolute Error: ', mae_mlp)
print('R-squared: ', rsq_mlp)



Mean Squared Error:  0.005300868053561705
Mean Absolute Error:  0.041139113347481424
R-squared:  0.9999673530825911


Optimized MLP for Original Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score as r2s


# Train the artificial neural network
model = MLPRegressor(hidden_layer_sizes=(50, 50), activation='relu', solver='lbfgs', max_iter=5000, learning_rate='constant', alpha=0.01, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict the Consumer Price Index using the trained model
y_pred_mlp = model.predict(X_test_scaled)

# Calculate the Mean Squared Error
mse_mlp = mse(y_test, y_pred_mlp)
mae_mlp = mae(y_test, y_pred_mlp)
rsq_mlp = r2s(y_test, y_pred_mlp)

# Print the Mean Squared Error
print('Mean Squared Error: ', mse_mlp)
print('Mean Absolute Error: ', mae_mlp)
print('R-squared: ', rsq_mlp)



Mean Squared Error:  0.004281388790105986
Mean Absolute Error:  0.04059493730518824
R-squared:  0.9999736318382548
