In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import missingno as ms
warnings.simplefilter('ignore')

In [2]:
# Read the raw weather observations from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer='../input/apparent-temperature-prediction/weather_data.csv'
)
# Show the first five rows as a quick sanity check of the parse.
data.head(n=5)

In [3]:
# Show the last five rows of the raw frame.
data.tail(n=5)

In [4]:
# Count missing values per column of the raw frame.
data.isna().sum()

In [5]:
# Discard the columns that are not used anywhere in the rest of the notebook.
# NOTE(review): 'Loud Cover' is kept exactly as written; presumably it matches
# the CSV header verbatim -- confirm before "correcting" the spelling.
data = data.drop(
    columns=[
        'Summary',
        'Precip Type',
        'Loud Cover',
        'Daily Summary',
        'Wind Bearing (degrees)',
    ]
)

In [6]:
# Gap between measured and apparent temperature for every observation.
# NOTE(review): this column is computed from 'Apparent Temperature (C)', so it
# must not be fed to a model that predicts apparent temperature.
data['temp. difference'] = data['Temperature (C)'].sub(data['Apparent Temperature (C)'])

In [7]:
# Parse the timestamp strings into timezone-aware datetimes. utc=True converts
# every value to UTC, so strings carrying different UTC offsets still yield a
# single datetime64[ns, UTC] column instead of an object column of
# mixed-offset datetimes (which newer pandas versions warn about or reject).
# NOTE(review): assumes the 'Formatted Date' strings include a UTC offset --
# confirm against the CSV.
data['Formatted Date'] = pd.to_datetime(data['Formatted Date'], utc=True)

In [8]:
# Re-inspect the first five rows after the column drop and date parsing.
data.head(n=5)

In [9]:
# Summary statistics for the columns of the cleaned frame.
data.describe()

In [10]:
# Column dtypes -- check that 'Formatted Date' is no longer a plain string column.
data.dtypes

In [11]:
# Order the observations chronologically and use the timestamp as the index.
indexdata = data.sort_values(by='Formatted Date').set_index('Formatted Date')
# Index.drop_duplicates() and DataFrame.dropna() return new objects; the
# results have to be assigned back, otherwise neither clean-up step has any
# effect. Keep the first row for each repeated timestamp, then drop rows that
# still contain missing values.
indexdata = indexdata[~indexdata.index.duplicated(keep='first')]
indexdata = indexdata.dropna()
indexdata.head()

In [12]:
# From here on `data` names the time-indexed frame. This is the same object as
# `indexdata`, not a copy, so changes through either name affect both.
data = indexdata
data.head(n=5)

In [13]:
# Full-history line plot of the apparent temperature on a square 10x10 canvas.
fig = plt.figure(figsize=(10, 10))
plt.plot(data.loc[:, 'Apparent Temperature (C)'])

In [14]:
# Normalise the timestamp index to UTC (`data` and `indexdata` refer to the
# same frame at this point, so this rewrites the index of both names).
data.index = pd.to_datetime(indexdata.index, utc=True)
# Collapse the observations to one row per calendar day by averaging.
data2 = data.resample('D').mean()
data2.head(n=5)

In [15]:
# Count missing values per column in the daily frame.
data2.isna().sum()

In [16]:
# Column dtypes of the daily-averaged frame.
data2.dtypes

In [17]:
# Summary statistics of the daily-averaged frame.
data2.describe()

In [18]:
# Replace any missing daily values with the mean of the respective column.
data2 = data2.fillna(value=data2.mean())

In [19]:
# NOTE(review): only the last expression of a cell is displayed, so this
# head() call produces no visible output.
data2.head()
# Displays an integer-cast copy of the frame; the result is not assigned, so
# data2 itself keeps its original dtypes.
# NOTE(review): if a permanent cast was intended it must be assigned back -- confirm.
data2.astype(int)

In [20]:
# Cut the daily frame into 2006 seasonal windows by label-based date slicing
# (both end points are included).
winter = data2.loc['2006-01-01':'2006-02-28']
winter2 = data2.loc['2006-12-01':'2006-12-31']
spring = data2.loc['2006-03-01':'2006-05-31']
summer = data2.loc['2006-06-01':'2006-08-31']

In [21]:
# Apparent vs. measured temperature over the Jan-Feb 2006 window.
winter.plot.line(y=['Apparent Temperature (C)', 'Temperature (C)'])

#plt.plot(winter2['Apparent Temperature (C)'], winter2['Temperature (C)'])

In [22]:
# Apparent vs. measured temperature over the Mar-May 2006 window.
spring.plot.line(y=['Apparent Temperature (C)', 'Temperature (C)'])

In [23]:
# Apparent vs. measured temperature over the Jun-Aug 2006 window.
summer.plot.line(y=['Apparent Temperature (C)', 'Temperature (C)'])

In [24]:
# Overlay both daily series. The labels are attached to the lines themselves
# and legend() is called once without arguments. Passing a bare string to
# plt.legend() is treated as a sequence of labels (one character per line),
# and a second legend() call replaces the first, so the previous version
# labelled the two curves 't' and 'e'.
plt.plot(data2['Apparent Temperature (C)'], color='red', label='apparent_temp.')
plt.plot(data2['Temperature (C)'], color='blue', label='temperature')
plt.legend()

In [25]:
# Heatmap of the pairwise Pearson correlations between the daily columns.
sns.heatmap(data=data2.corr(method='pearson'))

In [26]:
# Target: daily mean apparent temperature.
y = data2['Apparent Temperature (C)']
# Features: everything except the target AND 'temp. difference'. That column
# is defined as Temperature - Apparent Temperature, so keeping it next to
# 'Temperature (C)' lets any model reconstruct the target exactly
# (target leakage) and makes every reported score meaningless.
x = data2.drop(columns=['Apparent Temperature (C)', 'temp. difference'])

In [27]:
# Row counts of features and target must agree.
(x.shape, y.shape)

In [28]:
from sklearn.model_selection import train_test_split , cross_val_score

In [29]:
from sklearn.metrics import accuracy_score,mean_squared_error

In [32]:
def model_train(model, x, y, random_state=42):
    """Fit ``model`` on a train split of ``(x, y)`` and print evaluation metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is fitted in
        place on the training split.
    x : feature table handed to ``train_test_split`` and ``cross_val_score``.
    y : target values aligned row-for-row with ``x``.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so that every model is scored
        on the same hold-out rows and re-running the notebook reproduces the
        same numbers. Pass ``None`` for an unseeded (random) split.

    Returns
    -------
    None
        Results are printed: the hold-out score, the mean 5-fold
        cross-validation score, the hold-out mean squared error and a table
        of actual vs. predicted values.
    """
    # A fixed seed keeps the hold-out set identical across models and runs;
    # without it the printed scores are not comparable between calls.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=random_state
    )
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    score = model.score(x_test, y_test)
    print('model score ' , score)

    # cross_val_score is given the full data set and cv=5.
    val_score = cross_val_score(model, x, y, cv=5)
    print('cross val score' ,np.mean(val_score))

    # Mean squared error on the hold-out rows.
    error = mean_squared_error(y_test, predict)
    print('error = ',error)
    dataframe = pd.DataFrame({'actual': y_test, 'predicted': predict})
    print(dataframe)
    

In [33]:
from sklearn.linear_model import LinearRegression

# Baseline: linear regression with default settings.
model = LinearRegression()
model_train(model, x=x, y=y)

In [34]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with default hyper-parameters.
model = RandomForestRegressor()
model_train(model, x=x, y=y)

In [35]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree regressor with default hyper-parameters.
model = DecisionTreeRegressor()
model_train(model, x=x, y=y)

In [36]:
from xgboost import XGBRegressor

# Gradient-boosted trees (XGBoost) with default hyper-parameters.
model = XGBRegressor()
model_train(model, x=x, y=y)

In [40]:
#for xgboost only

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 * 3 * 3 * 2 = 54 candidate combinations.
params = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7],
}

# The estimator is created explicitly so the search always tunes XGBoost and
# does not depend on whichever cell last assigned the `model` variable.
clf = GridSearchCV(estimator=XGBRegressor(),
                   param_grid=params,
                   scoring='neg_mean_squared_error',
                   verbose=1)
# fit() takes the features and the target only; the estimator was already
# given to the constructor. Passing it again as the first positional argument
# (as before) makes the call fail.
clf.fit(x, y)
print("Best parameters:", clf.best_params_)
# The scorer is the negated MSE, so flip the sign before taking the root.
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))