## PART 1

In [24]:
#Sanskar Singh
#19CH10047




# INITIALISING AND LOADING IN THE DATA

import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Lasso

train_data = scipy.io.loadmat('2010rainfall.mat')
test_data = scipy.io.loadmat('2011rainfall.mat')
rainfall2010 = np.array(train_data['XR1'])
rainfall2011 = np.array(test_data['XR'])

In [25]:
#DEFINING IMPORTANT FUNCTIONS

#Function to create a feature set of desired properties as required by our problem
def train_set_creator(train_data,location):
    train_data_final = []
    train_data_update = np.delete(train_data, location, 0).T
    that_day = 3
    count = 0

    for train_row in train_data_update:
        time_cols = (train_data[location, (that_day-2):that_day])
        train_row = np.append(train_row,time_cols,0)
        train_data_final.append(train_row)
        if count>2:
            that_day += 1
        count += 0
    train_data_final = np.array(train_data_final)
    return train_data_final

#Function to evaluate mean squared error to judge the performance of our model
def mean_squared_error(prediction,actual):
    error_squared = 0
    total = 0
    for i in range(len(prediction)):
        error_squared += ((prediction[i]-actual[i])**2)
        total+=1
    mean = error_squared/total
    return mean
    
    
#Function to evaluate the most important features from a given list returns a dictionary with the keys as indices
def important_feature_finder(array,N):
    features = {}
    sorted_indices = np.argsort(array)
    for i in range(N):
        features[sorted_indices[-i]] = array[sorted_indices[-i]]
                 
    return features 

## PART 2

In [26]:
#TRAINING 3 DIFFERENT LINEAR REGRESSION MODELS TO PREDICT RAINFALL

#Mumbai Model Training 
train_Mumbai = train_set_creator(rainfall2010,42)
test_Mumbai = train_set_creator(rainfall2011,42)                  #Building the Model
regrMumbai = LinearRegression()                               
regrMumbai.fit(train_Mumbai , rainfall2010[42,:])
predictionM = regrMumbai.predict(test_Mumbai)                     #predictionM contains predicted rainfall for each day in Mumbai
MSE_Mumbai = mean_squared_error(rainfall2011[42,:],predictionM)   #Mean squared error in predictions

#Delhi Model Training
train_Delhi = train_set_creator(rainfall2010,158)
test_Delhi  = train_set_creator(rainfall2011,158)                  #Building the Model
regrDelhi = LinearRegression()                                     
regrDelhi.fit(train_Delhi, rainfall2010[158,:])                   
predictionD = regrDelhi.predict(test_Delhi)                         #predictionD contains the predicted rainfall for each day in Delhi
MSE_Delhi = (mean_squared_error(rainfall2011[158,:],predictionD))   #Mean squared error in predictions

#Kharagpur Model Training
train_Kharagpur = train_set_creator(rainfall2010,299)
test_Kharagpur = train_set_creator(rainfall2011,299)                  #Building the Model
regrKharagpur = LinearRegression()                                 
regrKharagpur.fit(train_Kharagpur, rainfall2010[299,:]) 
predictionKGP = regrKharagpur.predict(test_Kharagpur)               #predictionKGP contains the predicted rainfall for each day in Kharagpur
MSE_KGP = (mean_squared_error(rainfall2011[299,:],predictionKGP))   #Mean squared error in predictions
    

## PART 3

In [27]:
# Printing out the MSE values from our Linear Regression Models

print("The MSE for Mumbai is : \n")
print(MSE_Mumbai)
print("\nThe MSE for Delhi is : \n")
print(MSE_Delhi)
print("\nThe MSE for Kharagpur is : \n")
print(MSE_KGP)


The MSE for Mumbai is : 

688.3778703802776

The MSE for Delhi is : 

172.62977169745557

The MSE for Kharagpur is : 

239.75037413746443


## PART 4

In [28]:
#TRAINING 3 DIFFERENT LASSO REGRESSION MODELS TO PREDICT RAINFALL

lasso_reg1 = Lasso(alpha = 0.1, normalize=True, max_iter = 1e5)
lasso_reg2 = Lasso(alpha = 0.1, normalize=True, max_iter = 1e5)
lasso_reg3 = Lasso(alpha = 0.1, normalize=True, max_iter = 1e5)


lasso_Mumbai = lasso_reg1.fit(train_Mumbai,rainfall2010[42,:])                                 #Building the lasso regression model
lasso_predM =lasso_reg1.predict(train_Mumbai)                                                  #lasso_predM contains predictions for each day of rainfall in Mumbai
print("\n\nLasso MSE for Mumbai : ", mean_squared_error(lasso_predM,rainfall2011[42,:] ))    #Printing out the MSE values

lasso_Delhi = lasso_reg2.fit(train_Delhi,rainfall2010[158,:])
lasso_predD =lasso_reg2.predict(test_Delhi)
print("\n\nLasso MSE for Delhi : ", mean_squared_error(lasso_predD,rainfall2011[158,:] ))

lasso_KGP = lasso_reg3.fit(train_Kharagpur,rainfall2010[299,:])
lasso_predKGP =lasso_reg3.predict(test_Kharagpur)
print("\n\nLasso MSE for KGP : ", mean_squared_error(lasso_predKGP,rainfall2011[299,:] ))





Lasso MSE for Mumbai :  1615.1028498320413


Lasso MSE for Delhi :  68.46155648941063


Lasso MSE for KGP :  249.26852498359472


In [29]:
#Finding the most important loactions for determining the rain in these cities

Mumbaifeatures = important_feature_finder(lasso_Mumbai.coef_, 5)
print("\nThe most important locations for Mumbai are:\n ", Mumbaifeatures.keys())

Delhifeatures = important_feature_finder(lasso_Delhi.coef_, 5)
print("\nThe most important locations for Delhi are:\n ", Delhifeatures.keys())

KGPfeatures = important_feature_finder(lasso_KGP.coef_, 5)
print("\nThe most important locations for KGP are:\n ", KGPfeatures.keys())


The most important locations for Mumbai are:
  dict_keys([158, 26, 25, 124, 226])

The most important locations for Delhi are:
  dict_keys([256, 131, 292, 263, 53])

The most important locations for KGP are:
  dict_keys([0, 298, 210, 304, 217])


## PART 5

In [30]:
#TRAINING DECISION TREE MODEL TO PREDICT IF A DAY IS EITHER RAINY OR NOT


DTregressor = DecisionTreeRegressor()
DTtrain = np.array(train_data['ZR1'][0])
DTregressor.fit(rainfall2010.T, DTtrain)
y_pred = DTregressor.predict(rainfall2011.T)

print("\nThe most important locations for classifying a day rainy or not are given as follows(using Gini importance): \n")
print((important_feature_finder(DTregressor.feature_importances_,10)).keys())




The most important locations for classifying a day rainy or not are given as follows(using Gini importance): 

dict_keys([0, 184, 252, 160, 26, 204, 159, 150, 173, 25])


## PART 6

In [31]:
print("\nThe MSE determined using a decision tree classification is: \n")
print(mean_squared_error(y_pred, np.array(test_data['ZR'][0])))
print("\nThe accuracy of the prediction of our model is :\n")
print(str((1-mean_squared_error(y_pred, np.array(test_data['ZR'][0])))*100) + '%')





The MSE determined using a decision tree classification is: 

0.29508196721311475

The accuracy of the prediction of our model is :

70.49180327868852%
