In [None]:
import os
import pandas as pd
import regex
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Load the raw weather observations; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../input/weatherHistory.csv")

# Map the verbose CSV headers onto short, identifier-style column names.
# NOTE(review): "Loud Cover" is kept verbatim because the key has to match the
# header in the CSV — presumably the file itself carries that spelling; verify.
renamed_cols = {
    "Precip Type": "PrecipitationType",
    "Formatted Date": "Date",
    "Temperature (C)": "Temperature",
    "Apparent Temperature (C)": "ApparentTemperature",
    "Wind Speed (km/h)": "WindSpeed",
    "Wind Bearing (degrees)": "WindBearing",
    "Visibility (km)": "Visibility",
    "Loud Cover": "CloudCover",
    "Pressure (millibars)": "Pressure",
    "Daily Summary": "DailySummary"
}

df = df.rename(columns=renamed_cols)

# Back-fill missing values from the next valid row. `DataFrame.bfill()` replaces
# `fillna(method="bfill")`, whose `method` argument is deprecated in pandas 2.x.
# Gaps at the very end of the frame have no later row to copy from and stay NaN.
df = df.bfill()

# Integer-encode the two text columns. Each column gets its own encoder so that
# both fitted mappings stay available for `inverse_transform`; refitting a
# single encoder would overwrite the first column's classes. `label_encoder`
# still ends up fitted on DailySummary.
summary_encoder = LabelEncoder()
label_encoder = LabelEncoder()

df["Summary"] = summary_encoder.fit_transform(df["Summary"])
df["DailySummary"] = label_encoder.fit_transform(df["DailySummary"])

# The Date column is removed before modelling; the precipitation category is
# expanded into one indicator column per observed value.
df = df.drop(columns=["Date"])
df = pd.get_dummies(df, columns=['PrecipitationType'])

df.head(3)

In [None]:
# Pairwise correlation between all columns of the preprocessed frame
# (pandas' default correlation method).
correlation_matrix = df.corr()
correlation_matrix

In [None]:
import warnings

# NOTE(review): blanket suppression is kept from the original cell, but it
# silences every warning (including deprecation notices) for all later cells —
# consider narrowing it to a specific category or module.
warnings.filterwarnings('ignore')

# Overview of the data: dtypes and non-null counts, then summary statistics.
df.info()
# Only the final expression of a cell is rendered automatically, so the
# statistics table has to be displayed explicitly or it is silently dropped.
# `display` is provided by the IPython/Jupyter kernel.
display(df.describe().transpose())

# Plot the distribution of the Visibility column. `sns.distplot` is deprecated;
# `histplot` with a KDE overlay and density scaling is its documented
# replacement. The Axes is passed explicitly rather than relying on pyplot's
# implicit "current axes".
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(2, 1, 1)
sns.histplot(df['Visibility'], kde=True, stat="density", ax=ax)
ax.set_title("Distribution of Visibility")
plt.show()

In [None]:
# Fixed seed so the split — and therefore the reported errors — is repeatable
# across Restart & Run All.
RANDOM_STATE = 42
TARGET = 'Visibility'

# The target column must not be among the predictors: passing the full frame as
# the feature matrix hands the model the answer and makes the reported errors
# meaningless.
features = df.drop(columns=[TARGET])
target = df[TARGET]

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    features, target, test_size=0.33, random_state=RANDOM_STATE
)

# linear regression model
model = LinearRegression()
model.fit(X_train, Y_train)
Y_predicted = model.predict(X_test)

# Errors are measured on the held-out split only.
print('Mean squared error: ', mean_squared_error(Y_test, Y_predicted), ', mean absolute error: ', mean_absolute_error(Y_test, Y_predicted))