In [1]:
import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_squared_error

In [2]:
dataset = pd.read_csv(("../input/rainfall/Sub_Division_IMD_2017.csv"), encoding = "ISO-8859-1")
dataset

Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,JF,MAM,JJAS,OND
0,Andaman & Nicobar Islands,1901,49.2,87.1,29.2,2.3,528.8,517.5,365.1,481.1,332.6,388.5,558.2,33.6,3373.2,136.3,560.3,1696.3,980.3
1,Andaman & Nicobar Islands,1902,0.0,159.8,12.2,0.0,446.1,537.1,228.9,753.7,666.2,197.2,359.0,160.5,3520.7,159.8,458.3,2185.9,716.7
2,Andaman & Nicobar Islands,1903,12.7,144.0,0.0,1.0,235.1,479.9,728.4,326.7,339.0,181.2,284.4,225.0,2957.4,156.7,236.1,1874.0,690.6
3,Andaman & Nicobar Islands,1904,9.4,14.7,0.0,202.4,304.5,495.1,502.0,160.1,820.4,222.2,308.7,40.1,3079.6,24.1,506.9,1977.6,571.0
4,Andaman & Nicobar Islands,1905,1.3,0.0,3.3,26.9,279.5,628.7,368.7,330.5,297.0,260.7,25.4,344.7,2566.7,1.3,309.7,1624.9,630.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4183,Lakshadweep,2013,26.2,34.4,37.5,5.3,88.3,426.2,296.4,154.4,180.0,72.8,78.1,26.7,1426.3,60.6,131.1,1057.0,177.6
4184,Lakshadweep,2014,53.2,16.1,4.4,14.9,57.4,244.1,116.1,466.1,132.2,169.2,59.0,62.3,1395.0,69.3,76.7,958.5,290.5
4185,Lakshadweep,2015,2.2,0.5,3.7,87.1,133.1,296.6,257.5,146.4,160.4,165.4,231.0,159.0,1642.9,2.7,223.9,860.9,555.4
4186,Lakshadweep,2016,59.6,12.1,3.2,2.6,77.4,321.1,262.6,86.2,75.6,58.6,32.0,74.7,1065.7,71.7,83.2,745.4,165.4


In [3]:
# Grouping By Subdivisions
groups = dataset.groupby('SUBDIVISION')['YEAR','JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','NOV','DEC']

# Creating a North Interior Karnataka focused Datafram
data=groups.get_group(('North Interior Karnataka'))
data=data.melt(['YEAR']).reset_index()
df= data[['YEAR','variable','value']].reset_index().sort_values(by=['YEAR','index'])
df.columns=['INDEX','YEAR','Month','avg_rainfall']

# Encoding Months
d={'JAN':1,'FEB':2,'MAR' :3,'APR':4,'MAY':5,'JUN':6,'JUL':7,'AUG':8,'SEP':9,
   'OCT':10,'NOV':11,'DEC':12}
df['Month']=df['Month'].map(d)

  


In [4]:
# Adding a date column
df['Date']=pd.to_datetime(df.assign(Day=1).loc[:,['YEAR','Month','Day']])
df

Unnamed: 0,INDEX,YEAR,Month,avg_rainfall,Date
0,0,1901,1,3.5,1901-01-01
117,117,1901,2,18.8,1901-02-01
234,234,1901,3,7.1,1901-03-01
351,351,1901,4,67.2,1901-04-01
468,468,1901,5,65.5,1901-05-01
...,...,...,...,...,...
818,818,2017,7,76.8,2017-07-01
935,935,2017,8,105.8,2017-08-01
1052,1052,2017,9,206.6,2017-09-01
1169,1169,2017,11,4.9,2017-11-01


In [5]:
X=np.asanyarray(df[['YEAR','Month']]).astype('int')
y=np.asanyarray(df['avg_rainfall']).astype('int')

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [7]:
from lightgbm import LGBMRegressor

model_lgb = LGBMRegressor()
model_lgb.fit(X_train, y_train)

y_pred_lgb = model_lgb.predict(X_train)

In [8]:
error=mean_squared_error(y_train, y_pred_lgb, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

26.75813179543014


In [9]:
y_pred_lgb = model_lgb.predict(X_test)

error=mean_squared_error(y_test, y_pred_lgb, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

35.05423861757538


In [10]:
import xgboost as xgb

xg_reg = xgb.XGBRegressor()
xg_reg.fit(X_train,y_train)

y_pred_xgb = xg_reg.predict(X_train)

In [11]:
error=mean_squared_error(y_train, y_pred_xgb, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

11.276584324264357


In [12]:
y_pred_lgb = xg_reg.predict(X_test)

error=mean_squared_error(y_test, y_pred_lgb, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

39.82378689505708


In [13]:
from sklearn.tree import DecisionTreeRegressor

dectree = DecisionTreeRegressor(max_depth=10)
dectree.fit(X_train,y_train)

y_pred_tree = dectree.predict(X_train)

In [14]:
error=mean_squared_error(y_train, y_pred_tree, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

21.828186208940863


In [15]:
y_pred_tree = dectree.predict(X_test)
error=mean_squared_error(y_test, y_pred_tree, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

42.35618137220603


In [16]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()
rf.fit(X_train,y_train)

y_pred_rf = rf.predict(X_train)

In [17]:
error=mean_squared_error(y_train, y_pred_rf, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

14.46518797358372


In [18]:
y_pred_rf = dectree.predict(X_test)
error=mean_squared_error(y_test, y_pred_rf, sample_weight=None, multioutput='uniform_average', squared=False)
print(error)

42.35618137220603
