In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline

In [None]:
# Read the weather history CSV into a DataFrame and preview its first rows
df = pd.read_csv('../input/szeged-weather/weatherHistory.csv')
df.head()

In [None]:
# Column dtypes and non-null counts (shows which columns have missing values)
df.info()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Remove three columns that are not used in the rest of the notebook.
# NOTE(review): 'Loud Cover' is presumably the header's spelling in the CSV;
# drop() raises KeyError if a label is not present.
df = df.drop(columns=['Loud Cover', 'Summary', 'Daily Summary'])

In [None]:
# Frequency of each precipitation type. dropna=False also lists missing values,
# which value_counts() hides by default.
df['Precip Type'].value_counts(dropna=False)

In [None]:
# Encode precipitation type as a numeric flag: 1.0 for 'snow', 0.0 for any other
# recorded type. Missing entries are kept as NaN (instead of being silently
# turned into 0) so they remain visible to the df.dropna() step further down.
# Run this cell once: after it has run, the column no longer holds strings.
df['Precip Type'] = (
    df['Precip Type'].eq('snow').astype(float).where(df['Precip Type'].notna())
)

In [None]:
# Convert the 'Formatted Date' column to a timezone-aware (UTC) datetime dtype.
# NOTE(review): because of utc=True, every timestamp is expressed in UTC, so any
# date parts derived from it may differ from the local calendar date near midnight.
df['Formatted Date']=pd.to_datetime(df['Formatted Date'], utc=True)

In [None]:
# Derive calendar features (year, month, day of month) from the parsed timestamp
df = df.assign(
    Year=df['Formatted Date'].dt.year,
    Month=df['Formatted Date'].dt.month,
    Day=df['Formatted Date'].dt.day,
)

In [None]:
# Preview the frame again to check the newly added Year / Month / Day columns
df.head()

In [None]:
# Plot per-column histograms (50 bins each) on a 20x15 inch figure
df.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Strip the "(C)" unit suffix from the two temperature column names
df = df.rename(
    columns={
        'Temperature (C)': 'Temperature',
        'Apparent Temperature (C)': 'Apparent Temperature',
    }
)

In [None]:
# NOTE: a train/test split used to be attempted in this cell, but X and y do not
# exist yet at this point, so it raised NameError on a fresh "Restart & Run All".
# The split is performed further down, right after X and y are built.

In [None]:
# Data visualization: visibility plotted against humidity, one point per row
df.plot.scatter(x='Humidity', y='Visibility (km)')

In [None]:
# Pairwise correlations between the numeric columns only. The frame still holds
# the datetime 'Formatted Date' column at this point, and recent pandas releases
# no longer drop non-numeric columns from corr() automatically.
corr_matrix = df.select_dtypes(include='number').corr()

In [None]:
# Correlation of every column with 'Temperature', strongest positive first
corr_matrix['Temperature'].sort_values(ascending=False)

In [None]:
# The correlations above show that apparent temperature is highly correlated with
# temperature, which is expected; humidity is clearly negatively correlated with temperature.

In [None]:
# Discard every row that still contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# The raw timestamp column is dropped before modelling
df = df.drop(columns=['Formatted Date'])

In [None]:
# Target: the measured temperature. Features: every remaining column.
# NOTE(review): 'Apparent Temperature' stays in X although it is highly
# correlated with the target -- confirm this is intended.
y = df['Temperature']
X = df.drop(columns=['Temperature']).copy()

In [None]:
# Train/test split: 80% of the rows go to training; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=18)

In [None]:
# Final check of the column dtypes and non-null counts before modelling
df.info()

In [None]:
# Linear regression model; fit() returns the estimator itself, so build and
# train in one step, then echo the fitted estimator as the cell output
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# RMSE of the linear model on the same rows it was trained on (training error)
df_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_true=y_train, y_pred=df_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
# 10-fold cross-validation of the linear model on the training split.
# The scorer returns negated MSE, so flip the sign before taking the root.
scores = cross_val_score(
    lin_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(scores))

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    """
    print("Scores:", scores)
    # Each statistic is computed right before it is printed, in this order.
    for label, stat_name in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, stat_name)())

# Report the per-fold RMSE values of the linear model with their mean and spread
display_scores(lin_rmse_scores)

In [None]:
# Decision tree regressor. The seed is pinned (same value as the train/test
# split) because scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ from one run to the next.
des_tree = DecisionTreeRegressor(random_state=18)
des_tree.fit(X_train, y_train)

In [None]:
# RMSE of the decision tree on the rows it was trained on (training error).
# Stored under its own names so the linear model's lin_mse / lin_rmse are not
# overwritten. An unconstrained tree can fit its training data almost exactly,
# so the cross-validated scores are the more meaningful figure.
df2_predictions = des_tree.predict(X_train)
des_mse = mean_squared_error(y_train, df2_predictions)
des_rmse = np.sqrt(des_mse)
des_rmse

In [None]:
# 10-fold cross-validation of the decision tree on the training split;
# negated-MSE scores are converted to per-fold RMSE and summarised
scores = cross_val_score(
    des_tree,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)
des_rmse_scores = np.sqrt(np.negative(scores))

display_scores(des_rmse_scores)

In [None]:
# Random forest regressor. Fitting is stochastic (bootstrap samples, feature
# shuffling), so the seed is pinned (same value as the train/test split) to
# keep the reported numbers repeatable across runs.
ran_for = RandomForestRegressor(random_state=18)
ran_for.fit(X_train, y_train)

In [None]:
# RMSE of the random forest on the rows it was trained on (training error)
df3_predictions = ran_for.predict(X_train)
ran_mse = mean_squared_error(y_true=y_train, y_pred=df3_predictions)
ran_rmse = np.sqrt(ran_mse)
ran_rmse

In [None]:
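#Random Forest is the best model for now. NOTE: unlike the other two models, it was
#only scored on the training data above; cross-validate it and evaluate on
#X_test / y_test before relying on this conclusion.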