# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing data files

In [None]:
import seaborn as sns
sns.set_theme()
plt.rcParams["figure.figsize"] = (15, 10)

# Base names of the CSV files read from the data/ directory. The first entry
# is the only one whose "date" column is kept (see the loop below).
files = ["Delhi",
         "East_20", "East_100", "East_500",
         "North_20", "North_100", "North_500",
         "South_20", "South_100", "South_500",
         "West_20", "West_100", "West_500"]
# List containing the data for the different cities, in the same order as `files`
data = []

for i, name in enumerate(files):
    data_ = pd.read_csv(f"data/{name}.csv")

    if i != 0:
        # Only the first table keeps its date column.
        data_.pop("date")
    else:
        # Correlation heatmap of the first table. Restrict to numeric columns
        # explicitly: since pandas 2.0 `corr()` no longer drops non-numeric
        # columns (such as "date") silently and raises instead.
        corr = data_.select_dtypes(include="number").corr()
        sns.heatmap(corr,
                    xticklabels=corr.columns.values,
                    yticklabels=corr.columns.values,
                    annot=True)
    data.append(data_)

# Preprocessor class that preprocesses the data

In [None]:
class PreProcessor:
    """Build a standardized sliding-window dataset for binary classification.

    The tables in ``data`` are concatenated column-wise. For every start row
    ``i`` the feature vector consists of the values of rows ``i`` to
    ``i + days - 1`` (column 0 of the concatenated table is skipped --
    presumably the "date" column of ``data[0]``; confirm against the input
    files), followed by three integers parsed from ``data[0]["date"][i]``.
    The label is ``data[0][attr]`` at row ``i + days``, set to 1 when it is
    strictly greater than ``cutoff`` and to 0 otherwise.

    Attributes:
        X: float array of shape ``(rows - days, window_features + 3)``,
            standardized column-wise by ``standardize_data``.
        Y: float array of shape ``(rows - days, 1)`` holding the 0/1 labels.
    """

    # days denotes the size of the window
    def __init__(self, data, attr, cutoff, days=1):
        """Assemble ``X`` and ``Y`` and standardize ``X``.

        Args:
            data: list of DataFrames; ``data[0]`` must provide the columns
                "date" and ``attr``.
            attr: name of the column of ``data[0]`` used as prediction target.
            cutoff: target values strictly above this become class 1.
            days: number of consecutive rows in one window.

        Raises:
            ValueError: if a date string does not consist of exactly three
                integers separated by "-".
        """
        n_samples = data[0].shape[0] - days

        # X and Y data
        self.X = np.zeros((n_samples, ((data[0].shape[1] - 1) * len(data)) * days))
        self.Y = np.zeros((n_samples, 1))

        # Concatenation of the cities' data (column-wise)
        data_cities = pd.concat(list(data), axis=1).to_numpy()
        features = data_cities[:, 1:]
        width = features.shape[1]

        # Data filling: block `offset` of every sample holds the row that lies
        # `offset` rows after the start of its window.
        for offset in range(days):
            start = offset * width
            self.X[:, start:start + width] = features[offset:offset + n_samples]

        # Target value `days` rows after the start of each window. Rows are
        # read positionally so they line up with the to_numpy() features.
        target = data[0][attr].to_numpy()
        self.Y[:, 0] = target[days:days + n_samples]

        # Date of the first row of each window, split on "-" into the three
        # integers the original code names day, month and year.
        # NOTE(review): this is the date of the window start, not of the
        # target row -- confirm this is intended.
        date_ = data[0]["date"].to_numpy()
        date_rows = []
        for text in date_[:n_samples]:
            day, month, year = text.split("-")
            date_rows.append([int(day), int(month), int(year)])
        # reshape keeps the (0, 3) shape when there are no samples
        date_features = np.array(date_rows).reshape(-1, 3)

        self.X = np.append(self.X, date_features, axis=1)

        # Binarize the labels. Both masks are computed before anything is
        # written: writing 0 first and testing `> cutoff` afterwards turned
        # every label into 1 whenever cutoff was negative.
        below = self.Y <= cutoff
        above = self.Y > cutoff
        self.Y[below] = 0
        self.Y[above] = 1

        self.standardize_data()

    # Standardizing the data
    def standardize_data(self):
        """Shift every column of ``X`` to zero mean and scale it to unit std.

        Columns with zero standard deviation are only centred (their divisor
        is replaced by 1) to avoid division by zero.
        """
        mean = np.mean(self.X, axis=0)
        std = np.std(self.X, axis=0)
        std[std == 0] = 1
        self.X = (self.X - mean) / std
        
    #Splitting the data
    #def split(self):

def coeff_printer(file_order, given_model, data, days, consider_date=False):
    """Print each coefficient of a fitted linear model next to its feature.

    Coefficients are consumed in order: optionally two date coefficients
    first, then for every day of the window (counting down from ``days`` to
    1), for every file in ``file_order``, one coefficient per column of
    ``data[0]`` other than "date".

    NOTE(review): with ``consider_date=True`` the first two coefficients are
    labelled "year" and "year_day"; PreProcessor appends its three date
    columns at the end instead -- confirm which feature layout the model was
    trained on before relying on that flag.

    Args:
        file_order: labels of the data files, in the order their columns
            appear in the feature matrix.
        given_model: fitted estimator exposing ``coef_``; either 1-D or 2-D
            with a single row (as scikit-learn binary classifiers return).
        data: list of DataFrames; only the column names of ``data[0]`` are
            used.
        days: window size the model was trained with.
        consider_date: whether two leading date coefficients are present.
    """
    coefs = np.asarray(given_model.coef_)
    # Binary scikit-learn classifiers store coef_ with shape (1, n_features);
    # flatten that case so coefficients can be indexed one by one.
    if coefs.ndim == 2 and coefs.shape[0] == 1:
        coefs = coefs[0]

    index = 0
    if consider_date:
        print("year:", coefs[index])
        index += 1
        print("year_day:", coefs[index])
        index += 1

    # Only the column names are needed; DataFrame.iteritems() was removed in
    # pandas 2.0, so iterate over the columns directly.
    attrs = [attr for attr in data[0].columns if attr != "date"]

    for day in range(days, 0, -1):
        for file in file_order:
            for attr in attrs:
                str_coef = f"{round(coefs[index], 4)}"
                print(f"Before: {str(day):4s} | file: {file:10s} | attr: {attr:40s}",
                      f":{str_coef:>10s}")
                index += 1

In [None]:
# Build the dataset. Change `days` (the window size) here and re-run the cells
# below to train the models on a different window.
obj = PreProcessor(
    data=data,
    attr="Total Precipitation (MM)",
    cutoff=2.5,
    days=10,
)

In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    obj.X,
    obj.Y,
    test_size=0.20,
    random_state=200,
)


## Logistic regression 

In [None]:
# Cross-validated logistic regression fitted on the training split; the labels
# are flattened to 1-D before fitting.
clf = LogisticRegressionCV(max_iter=100000)
clf.fit(x_train, np.ravel(y_train))

# Predictions on the training data
y_hat = clf.predict(x_train)


## Defining score function

In [None]:
from sklearn.metrics import recall_score, precision_score, accuracy_score

try:
    # plot_roc_curve was removed in scikit-learn 1.2. Guard the import so the
    # cell (and therefore `score`) still loads on current versions, while the
    # name stays available where it exists.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None


def score(y, y_hat):
    """Print recall, accuracy and precision of ``y_hat`` against ``y``.

    Args:
        y: true labels.
        y_hat: predicted labels.
    """
    print("Recall is", recall_score(y, y_hat))
    print("Accuracy is", accuracy_score(y, y_hat))
    print("Precision is", precision_score(y, y_hat))

def give_y(y, threshold):
    """Threshold column 1 of ``y`` into hard 0/1 labels.

    Args:
        y: 2-D array; only column 1 is read (presumably the positive-class
            score, e.g. from ``predict_proba`` -- confirm with the caller).
            The input is not modified.
        threshold: entries greater than or equal to this become 1, entries
            below it become 0.

    Returns:
        A new 1-D array with the thresholded values, in the dtype of ``y``.
    """
    y_hat = np.copy(y[:, 1])
    # Evaluate both conditions on the untouched scores. Writing the 1s first
    # and testing `< threshold` afterwards reset them to 0 again whenever
    # threshold was greater than 1.
    positive = y_hat >= threshold
    negative = y_hat < threshold
    y_hat[positive] = 1
    y_hat[negative] = 0
    return y_hat

## ROC curve

In [None]:
from sklearn import datasets, metrics, model_selection, svm
# ROC curve of the logistic regression on the test set.
# metrics.plot_roc_curve was removed in scikit-learn 1.2; use
# RocCurveDisplay.from_estimator (available since 1.0) when present and fall
# back to the old helper on earlier versions.
if hasattr(getattr(metrics, "RocCurveDisplay", None), "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(clf, x_test, np.ravel(y_test))
else:
    metrics.plot_roc_curve(clf, x_test, np.ravel(y_test))

In [None]:
# Threshold set at 0.5
# Predict on both splits, then report the test metrics followed by the
# training metrics.
y_hat1 = clf.predict(x_test)
y_hat2 = clf.predict(x_train)

score(y_test, y_hat1)
score(y_train, y_hat2)

## Random Forest 

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(max_depth=13, random_state=0)
model.fit(x_train, np.ravel(y_train))

# Predict on both splits, then print training metrics followed by test metrics
y_hat1 = model.predict(x_train)
y_hat2 = model.predict(x_test)

score(y_train, y_hat1)
score(y_test, y_hat2)

## AdaBoost 

In [None]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 100 boosting rounds over depth-2 decision trees
model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=0,
)
model.fit(x_train, np.ravel(y_train))

# Predict on both splits, then print training metrics followed by test metrics
y_hat1 = model.predict(x_train)
y_hat2 = model.predict(x_test)

score(y_train, y_hat1)
score(y_test, y_hat2)

## MLP 

In [None]:
# MLP
from sklearn.neural_network import MLPClassifier

# Four hidden layers of decreasing width
model = MLPClassifier(
    alpha=0.001,
    random_state=1,
    validation_fraction=0.1,
    hidden_layer_sizes=(128, 64, 32, 16),
    max_iter=10000,
)
model.fit(x_train, np.ravel(y_train))

# Training metrics first, then test metrics
y_hat1 = model.predict(x_train)
y_hat2 = model.predict(x_test)

score(y_train, y_hat1)
score(y_test, y_hat2)

## Support Vector Machine 

In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings
model = SVC()
model.fit(x_train, np.ravel(y_train))

# Training metrics
y_hat1 = model.predict(x_train)
score(y_train, y_hat1)

# Test metrics
y_hat2 = model.predict(x_test)
score(y_test, y_hat2)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 9
model = KNeighborsClassifier(n_neighbors=9)
model.fit(x_train, np.ravel(y_train))

# Training metrics
y_hat1 = model.predict(x_train)
score(y_train, y_hat1)

# Test metrics
y_hat2 = model.predict(x_test)
score(y_test, y_hat2)

In [None]:
# Metric histories for the KNN models, one entry per k in 2..12
accuracy_train, precision_train, recall_train = [], [], []
accuracy_test, precision_test, recall_test = [], [], []

for i in range(2, 13):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(x_train, np.ravel(y_train))

    # Predictions on the test split (y_hat) and the training split (y_hat1)
    y_hat = model.predict(x_test)
    y_hat1 = model.predict(x_train)

    # Test-set metrics
    accuracy_test.append(accuracy_score(y_test, y_hat))
    recall_test.append(recall_score(y_test, y_hat))
    precision_test.append(precision_score(y_test, y_hat))

    # Training-set metrics
    accuracy_train.append(accuracy_score(y_train, y_hat1))
    recall_train.append(recall_score(y_train, y_hat1))
    precision_train.append(precision_score(y_train, y_hat1))

In [None]:
# Training accuracy of the KNN models as a function of k (k = 2..12)
k_values = np.arange(2, 13)
plt.plot(k_values, accuracy_train)
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.rc("grid", linestyle=":", color="black")
plt.show()