In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from xgboost import XGBRegressor 
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import plotly.graph_objects as go
from google.colab import drive

Accessing drive

In [None]:
drive.mount('/content/drive')

Reading data from the excel file

In [2]:
dfr = pd.read_excel(r'/content/drive/My Drive/CMIE_PROJECT/panel.xlsx', sheet_name='mainpanel')

Dropping irrelevant variables

In [3]:
dfr = dfr.drop(columns=['Total assets','Net fixed assets','Export / Sales (%)','IT/ITES & other professional services','Additions to computers and IT systems during the year','Gross software','Net profit margin','Net cash inflow or (outflow) from investing activities','Size','Total forex earnings / Total income (%)','Raw material imports / Raw material purchases (%)', 'Gross computers and IT systems'])

In [4]:
df=dfr.copy()

Filling missing values, linearly

In [44]:
def fill_nan_linearly(column):
    return column.interpolate(method='linear', limit_direction='both')
filled_df = df.groupby('Companies').apply(lambda group: group.apply(fill_nan_linearly, axis=0))
filled_df.reset_index(drop=True, inplace=True)

In [45]:
df=filled_df.copy()

In [47]:
comp=list(df['Companies'].unique())

Mathematical manipulation to avoid explosion in mape value

In [None]:
for i in df.columns:
  df.loc[df[i] == 0, i] = 0.1

Converting Companies to numeric strings

In [48]:
for i in range(len(comp)):
    df.loc[df['Companies'] == comp[i], 'Companies'] = str(i)

Splitting the Data to train(2010-2020) and test(2021-2023)

In [49]:
train = df[df['Year'] < 2020]
test = df[df['Year'] >= 2020]

In [50]:
train=pd.get_dummies(train, columns=['Companies'])
test=pd.get_dummies(test, columns=['Companies'])

In [51]:
X_train = train.drop(['Total income'], axis=1)
y_train = train['Total income']
X_test = test.drop(['Total income'], axis=1)
y_test = test['Total income']

ML Models

In [52]:
gbr = GradientBoostingRegressor()
ada=AdaBoostRegressor()
xgr=XGBRegressor()
rfr=RandomForestRegressor(max_depth=50, n_estimators=100)

In [53]:
gbr.fit(X_train, y_train)

In [73]:
xgr.fit(X_train, y_train)

In [57]:
ada.fit(X_train, y_train)

In [58]:
rfr.fit(X_train, y_train)

In [76]:
y_pred_tt_gbr= gbr.predict(X_test)
y_pred_tr_gbr=gbr.predict(X_train)

In [75]:
y_pred_tt_xgr= xgr.predict(X_test)
y_pred_tr_xgr=xgr.predict(X_train)

In [77]:
y_pred_tt_ada= ada.predict(X_test)
y_pred_tr_ada=ada.predict(X_train)

In [78]:
y_pred_tt_rfr= rfr.predict(X_test)
y_pred_tr_rfr=rfr.predict(X_train)

In [79]:
pred_tt=[y_pred_tt_gbr, y_pred_tt_xgr, y_pred_tt_ada, y_pred_tt_rfr]

In [80]:
pred_tr=[y_pred_tr_gbr, y_pred_tr_xgr, y_pred_tr_ada, y_pred_tr_rfr]

In [81]:
def mse(lis, y_tt_tr):
    temp=[]
    for i in lis:
        temp.append(root_mean_squared_error(y_tt_tr, i))
    return temp

def mae(lis, y_tt_tr):
    temp=[]
    for i in lis:
        temp.append(mean_absolute_error(y_tt_tr, i))
    return temp

def r2(lis, y_tt_tr):
    temp=[]
    for i in lis:
        temp.append(r2_score(y_tt_tr, i))
    return temp
                    
def mape(lis, y_tt_tr):
    temp=[]
    for i in lis:
        temp.append(mean_absolute_percentage_error(y_tt_tr, i))
    return temp

In [88]:
metrics={'reg':['gbr', 'xgr', 'ada', 'rfr'], 'mse_tr':mse(pred_tr, y_train), 'mse_tt':mse(pred_tt, y_test), 'r2_tr':r2(pred_tr, y_train), 'r2_tt':r2(pred_tt, y_test), 'mape_tr':mape(pred_tr, y_train), 'mape_tt':mape(pred_tt, y_test), 'mae_tr':mae(pred_tr, y_train), 'mae_tt':mae(pred_tt, y_test)}

In [89]:
metrics=pd.DataFrame(metrics)

In [90]:
metrics

Unnamed: 0,reg,mse_tr,mse_tt,r2_tr,r2_tt,mape_tr,mape_tt,mae_tr,mae_tt
0,gbr,1421.562966,13283.264127,0.998849,0.963029,29.895143,9.197143,400.659834,992.629114
1,xgr,1703.964592,21047.522726,0.998346,0.907177,23.890628,6.266225,433.848004,2185.182472
2,ada,21280.64901,25918.566716,0.742067,0.859241,617.831589,273.950868,18567.094468,18682.380156
3,rfr,1229.305949,14965.608026,0.999139,0.953071,10.918577,3.138309,136.244892,947.092959


In [85]:
test_lis=[]
for i in range(4):
    test_lis.append(list(pred_tr[i])+list(pred_tt[i]))


In [86]:
year_dat=list(train['Year'])+list(test['Year'])
for i in range(len(year_dat)):
    year_dat[i]=str(year_dat[i])

In [87]:
plot_df=[]
for i in range(4):
    pa=pd.DataFrame()
    pa['Prediction']=test_lis[i]
    pa['Year']=year_dat
    pa['Actual Values']=list(y_train)+list(y_test)
    plot_df.append(pa)


In [72]:
plot_lis=[]
title_lis=['Gradient Boost', 'XG Boost', 'AdaBoost', 'Random Forest']

for pa in plot_df:
    grouped_data = pa.groupby('Year').agg({'Prediction': 'sum', 'Actual Values': 'sum'}).reset_index()
    plot_lis.append(grouped_data)

# Create traces for Prediction and Actual Values
trace_gbr = go.Scatter(x=plot_lis[0]['Year'], y=plot_lis[0]['Prediction'], mode='lines', name='Gradient Boost', line=dict(color='red'))
trace_xgr = go.Scatter(x=plot_lis[1]['Year'], y=plot_lis[1]['Prediction'], mode='lines', name='XG Boost', line=dict(color='yellow'))
trace_ada = go.Scatter(x=plot_lis[2]['Year'], y=plot_lis[2]['Prediction'], mode='lines', name='Ada Boost', line=dict(color='blue'))
trace_rfr = go.Scatter(x=plot_lis[3]['Year'], y=plot_lis[3]['Prediction'], mode='lines', name='Random Forest', line=dict(color='purple'))

trace_actual = go.Scatter(x=plot_lis[0]['Year'], y=plot_lis[0]['Actual Values'], mode='lines', name='Actual Values', line=dict(color='green'))

# Create the figure and specify layout
fig = go.Figure([trace_gbr, trace_xgr, trace_ada, trace_rfr, trace_actual])
fig.update_layout(
    title='Predictions vs Actual Values',
    xaxis=dict(title='Year', tickfont=dict(size=14)),
    yaxis=dict(title='Total Income (in Million Rs.)', tickfont=dict(size=14)),
    legend=dict(font=dict(size=14))
)


In [41]:
for i in plot_lis:
    i.show()

AttributeError: 'DataFrame' object has no attribute 'show'

In [1]:
import pandas
dataf={'model':['LSTM', 'GRU'],
       'mse_tr':[1951.2408447959617,2034.962362911447],
       'mse_tt':[8376.165401754375,3991.3974320197044],
       'r2_tr':[0.9989151880004482,0.9988200430614701],
       'r2_tt':[0.9926227716945442,0.998497933082608],
       'mape_tr':[4.236100980971428,4.704764484524811],
       'mape_tt':[1.042334106585539,16.156899636521647],
       'mae_tr':[16.196777774804197,15.732498700471696],
       'mae_tt':[22.143294771270494,21.08064444834997]}
dataf=pandas.DataFrame(dataf)

In [2]:
dataf

Unnamed: 0,model,mse_tr,mse_tt,r2_tr,r2_tt,mape_tr,mape_tt,mae_tr,mae_tt
0,LSTM,1951.240845,8376.165402,0.998915,0.992623,4.236101,1.042334,16.196778,22.143295
1,GRU,2034.962363,3991.397432,0.99882,0.998498,4.704764,16.1569,15.732499,21.080644
