
# Load Data, Drop Unecessary Information

In [1]:
import pandas as pd
import os

In [2]:
def create_complete_df(num_files):
    path_to_features  = '/content/drive/MyDrive/SOC/Data/'

    list_of_dfs = []
    for i in range(num_files):
        csv_file = f'TOAFastFeatures{i + 1}.csv'
        print(csv_file)
        df = pd.read_csv(os.path.join(path_to_features, csv_file))
        list_of_dfs.append(df)

    df = pd.concat(list_of_dfs, axis=0)
    return df

In [3]:
df = create_complete_df(1)

TOAFastFeatures1.csv


In [4]:
df.reset_index(inplace = True)

In [5]:
df.drop(['index'], axis = 1, inplace = True)

# Iterative Imputer

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(verbose=2, max_iter=20)

imputed_df = pd.DataFrame(imputer.fit_transform(df), columns = df.columns)

# Merge HWSD Data

In [7]:
HWSD = pd.read_csv('/content/drive/MyDrive/SOCData/HWSD_DATA.csv')
HWSD = HWSD[['MU_GLOBAL', 'S_OC']]
HWSD

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,MU_GLOBAL,S_OC
0,7001,
1,7002,
2,7003,
3,7004,32.89
4,7005,
...,...,...
48143,32049,0.13
48144,32049,
48145,32050,
48146,32050,0.13


In [8]:
HWSD = HWSD.groupby('MU_GLOBAL').mean()

In [9]:
new = pd.merge(imputed_df, HWSD, on="MU_GLOBAL")
new = new.dropna()

# Split Data

In [10]:
y = new.pop('S_OC')
X = new

In [11]:
assert 'S_OC' not in X.columns

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

In [13]:
assert len(X_train) == len(y_train)

In [14]:
assert len(X_test) == len(y_test)

# Feature Standardization

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

# Decision Tree Regressor

In [16]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

DecisionTreeRegressor()

# Random Forest Regressor

In [17]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train, y_train)

RandomForestRegressor()

# XGBoost

In [18]:
from xgboost import XGBRegressor

X_train_xgb, X_val, y_train_xgb, y_val = train_test_split(X_train, y_train)

xgb = XGBRegressor(n_estimators=1500, learning_rate=0.01, eval_metric="rmse", early_stopping_rounds=10)
xgb.fit(X_train, y_train, verbose =100)



XGBRegressor(early_stopping_rounds=10, eval_metric='rmse', learning_rate=0.01,
             n_estimators=1500)

# SVM

In [19]:
from sklearn.svm import SVR

svr = SVR(C=1.0, epsilon=0.2)
svr.fit(X_train, y_train)

SVR(epsilon=0.2)

# LGBM

In [20]:
import lightgbm

hyper_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.005,
    'verbose': -1,
    'n_estimators': 1000,
    'random_state' : 0
}

lgbm = lightgbm.LGBMRegressor(**hyper_params)

lgbm.fit(X_train_xgb,
         y_train_xgb,
         eval_set = [(X_val, y_val)],
         callbacks = [lightgbm.early_stopping(stopping_rounds = 20)],
         verbose = 100
)

Training until validation scores don't improve for 20 rounds.
[100]	valid_0's rmse: 0.926856
[200]	valid_0's rmse: 0.90564
[300]	valid_0's rmse: 0.900741
Early stopping, best iteration is:
[334]	valid_0's rmse: 0.899897


LGBMRegressor(learning_rate=0.005, metric='rmse', n_estimators=1000,
              objective='regression', random_state=0, verbose=-1)

# RMSE

In [21]:
from sklearn.metrics import mean_squared_error

models_list = {
    "Decision Tree" : dt,
    "Random Forest" : rf,
    "XGBoost" : xgb,
    "SVR" : svr,
    "LGBM" : lgbm,
}

for name, model in models_list.items():
    print(name + ": " + str(round(mean_squared_error(model.predict(X_test), y_test, squared = False), 5)))

Decision Tree: 2.11777
Random Forest: 1.84182
XGBoost: 2.39853
SVR: 1.70894
LGBM: 1.66142
