#### Import libraries 

In [1]:
import pandas as pd 
import numpy as np 
from datetime import datetime 
import os 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
%matplotlib inline 

#### Mount google drive 

In [9]:
# Mount Google Drive at /content/drive (the google.colab module is only available on Colab).
# NOTE(review): the CSV loads below use relative paths rather than the mount point —
# confirm whether this cell is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Load train and test data 

In [2]:
# Read the training set first, then the test set, from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Dimensions of the training frame as (rows, columns).
train.shape

(909604, 16)

#### Calculate VIF values to check for multicollinearity (adapted from a code snippet found on Stack Overflow)

In [None]:
# Variance inflation factors (VIF) for each numeric feature.
# The VIF of feature i equals the i-th diagonal entry of the inverse of the
# feature correlation matrix, so one matrix inversion yields all of them.
#
# Build the frame from `train` directly so the cell runs on a fresh kernel.
# NOTE(review): the original read `train_check` before ever assigning it;
# presumably it was a copy of `train` — confirm.
train_check = train.drop(['timestamp', 'Target', 'turbine_id'], axis=1)
corr_df = train_check.corr()
# `index=` (keyword) labels each VIF with its feature name.
pd.Series(np.linalg.inv(corr_df.values).diagonal(), index=corr_df.index)

#### Utility functions 

In [4]:
def preprocess1(df):
    """Turn a raw frame into model features without modifying the input.

    Parses the ``timestamp`` column, derives ``month``, ``day``, ``hour`` and
    ``minute`` from it, drops the raw ``timestamp`` and one-hot encodes
    ``turbine_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp`` (anything ``pd.to_datetime`` accepts) and
        ``turbine_id`` columns. It is not mutated.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining original columns, then the four calendar
        columns, then the ``turbine_id_*`` dummy columns.
    """
    # Parse into a local Series instead of writing back into the caller's frame,
    # so calling this repeatedly on the same DataFrame never alters it.
    timestamps = pd.to_datetime(df['timestamp'])

    features = df.drop(['timestamp'], axis=1).assign(
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
    )

    return pd.get_dummies(features, columns=['turbine_id'])

In [5]:
def preprocess_vif(df):
    """Build model features with four power columns removed, leaving the input intact.

    Parses ``timestamp``, derives ``month``, ``day``, ``hour`` and ``minute``
    from it, drops the raw ``timestamp`` together with four power columns, and
    one-hot encodes ``turbine_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``timestamp``, ``turbine_id`` and every column listed in
        ``cols_to_drop`` below. It is not mutated.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining original columns, then the four calendar
        columns, then the ``turbine_id_*`` dummy columns.
    """
    # Parse into a local Series instead of writing back into the caller's frame,
    # so calling this repeatedly on the same DataFrame never alters it.
    timestamps = pd.to_datetime(df['timestamp'])

    # NOTE(review): 'reactice_...' looks misspelled but is kept verbatim because
    # it presumably matches the dataset's actual column name — confirm.
    cols_to_drop = ['timestamp', 'active_power_calculated_by_converter', 'active_power_raw',
                    'reactice_power_calculated_by_converter', 'reactive_power']

    features = df.drop(cols_to_drop, axis=1).assign(
        month=timestamps.dt.month,
        day=timestamps.dt.day,
        hour=timestamps.dt.hour,
        minute=timestamps.dt.minute,
    )

    return pd.get_dummies(features, columns=['turbine_id'])

#### All variables thrown into a linear regression model 

In [None]:
def gen_lr_model(df):
    """Fit a linear regression on ``df`` and report its hold-out error.

    ``df`` must contain a ``Target`` column; every other column is used as a
    feature. The rows are split 75/25 with ``random_state=0``; the model is
    fitted on the larger part and scored on the smaller one.

    Returns a ``(model, mse)`` tuple: the fitted ``LinearRegression`` and the
    mean squared error on the 25% hold-out split.
    """
    target = df['Target'].copy()
    features = df.drop(['Target'], axis=1)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, target, random_state=0, test_size=0.25
    )

    regressor = LinearRegression()
    regressor.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, regressor.predict(X_holdout))
    return regressor, holdout_mse

In [None]:
# Baseline run: linear regression on every feature produced by preprocess1.
# NOTE(review): train and test are one-hot encoded by separate preprocess1 calls;
# m1.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess1(train)
m1, score1 = gen_lr_model(train_preprocessed)  # score1: MSE on the 25% hold-out split
test_preprocessed = preprocess1(test)
test_pred = m1.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
pd.DataFrame({'Target':test_pred}).to_csv('m1.csv', index=False)
score1

#### All variables thrown into a random forest model 

In [6]:
def gen_random_forest_model(df, n_est):
    """Fit a random forest on ``df`` and report its hold-out error.

    ``df`` must contain a ``Target`` column; every other column is used as a
    feature. ``n_est`` is passed through as the forest's ``n_estimators``.
    The rows are split 75/25 with ``random_state=0``, and the forest itself is
    also seeded with ``random_state=0``.

    Returns a ``(model, mse)`` tuple: the fitted ``RandomForestRegressor`` and
    the mean squared error on the 25% hold-out split.
    """
    target = df['Target'].copy()
    features = df.drop(['Target'], axis=1)

    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        features, target, random_state=0, test_size=0.25
    )

    forest = RandomForestRegressor(n_estimators=n_est, random_state=0)
    forest.fit(X_fit, y_fit)

    holdout_mse = mean_squared_error(y_holdout, forest.predict(X_holdout))
    return forest, holdout_mse

##### num_estimators = 10 

In [None]:
# Random forest with 10 trees on every feature produced by preprocess1.
# NOTE(review): train and test are one-hot encoded by separate preprocess1 calls;
# m2.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess1(train)
m2, score2 = gen_random_forest_model(train_preprocessed, 10)  # score2: MSE on the 25% hold-out split
test_preprocessed = preprocess1(test)
test_pred = m2.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
pd.DataFrame({'Target':test_pred}).to_csv('m2.csv', index=False)
score2

0.24059807071732525

##### num_estimators = 50 

In [None]:
# Random forest with 50 trees on every feature produced by preprocess1.
# NOTE(review): train and test are one-hot encoded by separate preprocess1 calls;
# m3.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess1(train)
m3, score3 = gen_random_forest_model(train_preprocessed, 50)  # score3: MSE on the 25% hold-out split
test_preprocessed = preprocess1(test)
test_pred = m3.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
pd.DataFrame({'Target':test_pred}).to_csv('m3.csv', index=False)
score3

0.20109517358694004

##### num_estimators = 100 

In [None]:
# Random forest with 100 trees on every feature produced by preprocess1.
# NOTE(review): train and test are one-hot encoded by separate preprocess1 calls;
# m4.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess1(train)
m4, score4 = gen_random_forest_model(train_preprocessed, 100)  # score4: MSE on the 25% hold-out split
test_preprocessed = preprocess1(test)
test_pred = m4.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
# NOTE(review): unlike the other runs, this one writes under sample_data/ —
# confirm that the different output directory is intended.
pd.DataFrame({'Target':test_pred}).to_csv('sample_data/m4.csv', index=False)
score4

0.1972294202556921

##### num_estimators=50 (4 collinear columns dropped based on VIF)

In [7]:
# Random forest with 50 trees on the reduced feature set from preprocess_vif
# (the four power columns listed in that function are removed).
# NOTE(review): train and test are one-hot encoded by separate preprocess_vif calls;
# m8.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess_vif(train)
test_preprocessed = preprocess_vif(test)

m8, score8 = gen_random_forest_model(train_preprocessed, 50)  # score8: MSE on the 25% hold-out split
test_pred = m8.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
pd.DataFrame({'Target':test_pred}).to_csv('m8.csv', index=False)
score8

0.19014907146466956

##### num_estimators=100 (4 collinear columns dropped based on VIF)

In [8]:
# Random forest with 100 trees on the reduced feature set from preprocess_vif
# (the four power columns listed in that function are removed).
# NOTE(review): train and test are one-hot encoded by separate preprocess_vif calls;
# m9.predict presumably needs both to yield the same columns — confirm.
train_preprocessed = preprocess_vif(train)
test_preprocessed = preprocess_vif(test)

m9, score9 = gen_random_forest_model(train_preprocessed, 100)  # score9: MSE on the 25% hold-out split
test_pred = m9.predict(test_preprocessed)
# Save the test-set predictions as a single 'Target' column, without the index.
pd.DataFrame({'Target':test_pred}).to_csv('m9.csv', index=False)
score9 

0.1865892274846279