# Problem :
## Jaipur Rainfall Data
#### Jaipur is one of the regions in India that receives a very limited amount of rainfall throughout the year, so we will use data analysis to try to predict the amount of rainfall that will be received.

In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os.path import join
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Where the raw CSV lives: <dataset directory>/<file name>
dataset = 'DataSet-Jaipur'
filename = join(dataset, 'JaipurRawData.csv')

In [3]:
# Load the raw CSV, using its 'date' column as the row index.
df = pd.read_csv(filename, index_col='date')
# Preview the first rows.
df.head()

Unnamed: 0_level_0,meantempm,meandewptm,meanpressurem,maxhumidity,minhumidity,maxtempm,mintempm,maxdewptm,mindewptm,maxpressurem,minpressurem,precipm
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-05-01,34,-1,1005.63,24,4,43,26,9,-10,1009,999,0.0
2016-05-02,36,4,1005.46,21,6,43,29,10,-2,1008,1001,0.0
2016-05-03,35,6,1006.0,27,5,41,29,12,-2,1009,1000,0.0
2016-05-04,34,7,1005.65,29,6,41,27,13,0,1008,1001,0.0
2016-05-05,31,11,1007.94,61,13,38,24,16,6,1011,1003,5.0


In [4]:
# Dimensions of the whole frame and of a single record, followed by a blank line.
print('Shape of Dataset:', df.shape)
print('Shape of Each Row:', df.iloc[0].shape)
print()

Shape of Dataset: (679, 12)
Shape of Each Row: (12,)



##### Pad each row with the historic data from the past 2 days

In [5]:
def pad_nth_day_feature(df, feature, N):
    """Add a column holding `feature` as it was measured N rows earlier.

    The new column is named "<feature>_<N>" and is written into `df` in place.
    The first N rows have no earlier measurement and are filled with NaN.
    The look-back is positional (by row order), so it works for any index
    type, including integer indexes that do not start at 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. Rows are presumed to be in chronological order,
        oldest first -- verify against the data source.
    feature : str
        Name of an existing column in `df`.
    N : int
        Number of rows to look back.

    Returns
    -------
    None
        `df` is modified in place.
    """
    col_name = "{}_{}".format(feature, N)
    # shift() moves the values N positions down and pads the top with NaN,
    # replacing the per-row Python lookup that depended on integer labels.
    df[col_name] = df[feature].shift(N)

In [6]:
# List the column names currently in the frame.
df.columns

Index(['meantempm', 'meandewptm', 'meanpressurem', 'maxhumidity',
       'minhumidity', 'maxtempm', 'mintempm', 'maxdewptm', 'mindewptm',
       'maxpressurem', 'minpressurem', 'precipm'],
      dtype='object')

In [7]:
# Attach the 1-row and 2-row lagged copies of every measurement except the target.
# The column list is copied up front because pad_nth_day_feature adds columns to df.
for column in list(df.columns):
    if column == 'precipm':
        continue
    for n in (1, 2):
        pad_nth_day_feature(df, column, n)

In [8]:
# Preview the first rows again; the earliest rows have NaN in the lagged columns.
df.head()

Unnamed: 0_level_0,meantempm,meandewptm,meanpressurem,maxhumidity,minhumidity,maxtempm,mintempm,maxdewptm,mindewptm,maxpressurem,...,mintempm_1,mintempm_2,maxdewptm_1,maxdewptm_2,mindewptm_1,mindewptm_2,maxpressurem_1,maxpressurem_2,minpressurem_1,minpressurem_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-05-01,34,-1,1005.63,24,4,43,26,9,-10,1009,...,,,,,,,,,,
2016-05-02,36,4,1005.46,21,6,43,29,10,-2,1008,...,26.0,,9.0,,-10.0,,1009.0,,999.0,
2016-05-03,35,6,1006.0,27,5,41,29,12,-2,1009,...,29.0,26.0,10.0,9.0,-2.0,-10.0,1008.0,1009.0,1001.0,999.0
2016-05-04,34,7,1005.65,29,6,41,27,13,0,1008,...,29.0,29.0,12.0,10.0,-2.0,-2.0,1009.0,1008.0,1000.0,1001.0
2016-05-05,31,11,1007.94,61,13,38,24,16,6,1011,...,27.0,29.0,13.0,12.0,0.0,-2.0,1008.0,1009.0,1001.0,1000.0


In [9]:
# Frame and record dimensions after the lagged columns were appended
print('Shape of Dataset:', df.shape)
print('Shape of Each Row:', df.iloc[0].shape)

Shape of Dataset: (679, 34)
Shape of Each Row: (34,)


In [10]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 679 entries, 2016-05-01 to 2018-03-11
Data columns (total 34 columns):
meantempm          679 non-null int64
meandewptm         679 non-null int64
meanpressurem      679 non-null float64
maxhumidity        679 non-null int64
minhumidity        679 non-null int64
maxtempm           679 non-null int64
mintempm           679 non-null int64
maxdewptm          679 non-null int64
mindewptm          679 non-null int64
maxpressurem       679 non-null int64
minpressurem       679 non-null int64
precipm            679 non-null float64
meantempm_1        678 non-null float64
meantempm_2        677 non-null float64
meandewptm_1       678 non-null float64
meandewptm_2       677 non-null float64
meanpressurem_1    678 non-null float64
meanpressurem_2    677 non-null float64
maxhumidity_1      678 non-null float64
maxhumidity_2      677 non-null float64
minhumidity_1      678 non-null float64
minhumidity_2      677 non-null float64
maxtempm_1         678 n

##### Doing Some Data Cleaning Operations

In [11]:
def check_uniqueness(dataframe):
    """Drop, in place, every column that holds exactly one distinct value.

    A missing value counts as a value, so an all-NaN column is removed too.
    The same (mutated) frame object is returned.
    """
    constant_columns = [
        name
        for name in dataframe.columns
        if dataframe[name].nunique(dropna=False) == 1
    ]
    dataframe.drop(constant_columns, axis=1, inplace=True)
    return dataframe

# Remove constant columns; the function edits df in place and returns that same frame.
df = check_uniqueness(df)

In [12]:
# Discard every row that contains a missing value
df = df.dropna()

In [13]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
meantempm,677.0,26.06647,6.213891,10.0,20.0,28.0,30.0,38.0
meandewptm,677.0,11.982275,8.731514,-10.0,5.0,10.0,21.0,26.0
meanpressurem,677.0,1009.505081,6.478314,997.63,1003.7,1008.85,1015.52,1023.43
maxhumidity,677.0,67.397341,22.05742,15.0,50.0,66.0,88.0,100.0
minhumidity,677.0,22.794682,19.367531,4.0,9.0,15.0,33.0,89.0
maxtempm,677.0,32.536189,5.992397,18.0,28.0,33.0,37.0,46.0
mintempm,677.0,19.644018,6.830276,3.0,13.0,22.0,25.0,32.0
maxdewptm,677.0,16.166913,7.237327,0.0,10.0,15.0,23.0,29.0
mindewptm,677.0,7.478582,11.686926,-94.0,0.0,6.0,18.0,25.0
maxpressurem,677.0,1012.251108,6.527288,999.0,1006.0,1012.0,1018.0,1026.0


### Preparing Data

Normalizing the Data

In [14]:
# Normalisation is currently disabled; the code below is kept for reference only.
# NOTE(review): apply(norm, axis=1) would standardise each row across its columns;
# standardising each column would need axis=0 -- confirm the intent before re-enabling.
# def norm(x):
#     return (x - np.mean(x)) / np.std(x)

# df = df.apply(norm, axis=1)

For Precipitation Prediction

In [15]:
# Predictors are every column except 'precipm'; 'precipm' itself is the target.
x_data = df.drop('precipm', axis=1)
y_data = df['precipm']

In [16]:
# Dimensions of the predictor matrix and the target vector
print('Shape of X:', x_data.shape)
print('Shape of Y:', y_data.shape)

Shape of X: (677, 33)
Shape of Y: (677,)


#### Split Train and Test Data


In [152]:
from sklearn.model_selection import train_test_split

In [153]:
# Split into Training and Test Set (20% of the rows are held out for testing).
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

In [154]:
# Convert all four splits from pandas objects to plain numpy arrays
x_train, x_test, y_train, y_test = (
    np.array(split) for split in (x_train, x_test, y_train, y_test)
)

In [155]:
# Dimensions of the training and testing splits
print('Training Set : X -> {}, Y -> {}'.format(*(part.shape for part in (x_train, y_train))))
print('Testing Set: X -> {}, Y -> {}'.format(*(part.shape for part in (x_test, y_test))))

Training Set : X -> (541, 33), Y -> (541,)
Testing Set: X -> (136, 33), Y -> (136,)


#### Now we have a Training Set and a Testing Set

##### 1. Applying Linear Regression

In [75]:
from sklearn.linear_model import LinearRegression

In [86]:
# Baseline model: a plain linear regression fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [94]:
# Predict the target for the held-out test rows.
y_test_predicted = lin_reg.predict(x_test)

In [156]:
# RMSE of the linear model on the test split, rounded to 5 decimals
rmse = np.round(np.sqrt(mean_squared_error(y_test, y_test_predicted)), 5)
print('Root Mean Square Error:', rmse)

# Coefficient of determination on the test split, rounded to 5 decimals
r2 = np.round(r2_score(y_test, y_test_predicted), 5)
print('R2 Score :', r2)


NameError: name 'y_test_predicted' is not defined

##### 2. Fitting Polynomial Regression

In [139]:
from sklearn.preprocessing import PolynomialFeatures

In [149]:
# Expand the inputs into polynomial terms, then fit a linear model on the expanded inputs.
# NOTE(review): degree=5 on 33 input columns produces far more terms than the 541
# training rows; consider a lower degree or a regularised model.
polynomial_features= PolynomialFeatures(degree=5)
x_train_poly = polynomial_features.fit_transform(x_train)
# The transformer is fitted on the training split only; the test split is just transformed.
x_test_poly = polynomial_features.transform(x_test)
lin_reg = LinearRegression()
lin_reg.fit(x_train_poly, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [150]:
# Predict the target for the polynomially expanded test rows.
y_test_poly_predicted = lin_reg.predict(x_test_poly)

In [151]:
# RMSE of the polynomial model on the test split, rounded to 5 decimals
rmse = np.round(np.sqrt(mean_squared_error(y_test, y_test_poly_predicted)), 5)
print('Root Mean Square Error:', rmse)

# Coefficient of determination on the test split, rounded to 5 decimals
r2 = np.round(r2_score(y_test, y_test_poly_predicted), 5)
print('R2 Score :', r2)


Root Mean Square Error: 25.51118
R2 Score : -38.36522
