In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import datetime

from sklearn.feature_selection import f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
data_raw = pd.read_csv("/kaggle/input/shanghai-air-pollution-and-wheather-20142021/shanghai-air-quality.csv")

In [None]:
data_raw.head()

In [None]:
data_raw.dtypes

# Let's change the data type of "date" column to a datetime data type

In [None]:
# `df` is another name for `data_raw` (no copy is made), so the conversion
# below also changes `data_raw`.
df = data_raw
# Parse the "YYYY-MM-DD" strings into datetime values.
df.date = pd.to_datetime(df['date'], format="%Y-%m-%d") 
# Order the rows chronologically; the existing index labels are kept.
df_sorted = df.sort_values(by= ["date"])
# Display only: the result is not assigned, so "date" stays a regular column
# of `df_sorted`.
df_sorted.set_index("date")

In [None]:
df_sorted

In [None]:
df_sorted.columns

In [None]:
df_sorted.columns = ['date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co']
df = df_sorted

In [None]:
# Remove the padding whitespace around every raw reading and convert the
# pollutant columns to a numeric dtype.
# The whole column is processed at once with `.str.strip()`: the previous
# element-wise `df[col][i] = ...` was a chained assignment, which pandas warns
# about and which no longer writes back into `df` under copy-on-write.
for col in ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']:
    readings = df[col]
    # A column that is already numeric (e.g. when the cell is re-run) has
    # nothing to strip and no `.str` accessor.
    if not pd.api.types.is_numeric_dtype(readings):
        readings = readings.str.strip()
    df[col] = pd.to_numeric(readings)

In [None]:
shanghai = pd.read_csv("/kaggle/input/shanghai-air-pollution-and-wheather-20142021/shanghai.csv"); shanghai

In [None]:
shanghai.columns = ['date', 'maxtempC', 'mintempC', 'totalSnow_cm', 'sunHour',
       'uvIndex', 'moon_illumination', 'moonrise', 'moonset', 'sunrise',
       'sunset', 'DewPointC', 'FeelsLikeC', 'HeatIndexC', 'WindChillC',
       'WindGustKmph', 'cloudcover', 'humidity', 'precipMM', 'pressure',
       'tempC', 'visibility', 'winddirDegree', 'windspeedKmph', 'location']
shanghai.date = pd.to_datetime(shanghai['date'], format="%Y-%m-%d") ; 

In [None]:
shanghai.isnull().sum()

In [None]:
merged_data = df.merge(shanghai, how = "left", on = "date")
merged_data.head()

In [None]:
merged_data.isnull().sum()

# Handling the missing values

In [None]:
data_cleaned = merged_data;
data_cleaned.head()

In [None]:
# Derive the cleaned frame from `merged_data` in a single, non-mutating step:
# the last row and the sun/moon/location columns are left out.
# Dropping in place removed one more row on every re-run of the cell and also
# altered `merged_data`, which `data_cleaned` was only an alias of.
data_cleaned = merged_data.drop(
    index=merged_data.index[-1],
    columns=['moonrise', 'moonset', 'sunrise', 'sunset','location'],
)

In [None]:
df= data_cleaned

To fill the missing values we will not use the mean of the whole column, but rather the mean of a 10-day interval around each gap, because the values change too much throughout the year and the mean of the whole year would not be a good fit for filling the data.

In [None]:
# Fill each missing pollutant reading with the average of the mean of the
# readings just after it and the mean of the readings just before it.
# Two passes are made because a window that itself contains NaN yields NaN;
# gaps next to other gaps can therefore only be filled on the second pass.
for _ in range(2):
    for col in ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co']:
        # Work on a float array and write the column back once; the previous
        # `df[col][i] = ...` was a chained assignment that pandas warns about
        # and that is not written back under copy-on-write.
        values = df[col].to_numpy(dtype=float, copy=True)
        for pos in np.flatnonzero(np.isnan(values)):
            following = values[pos + 1:pos + 5]
            # The start is clamped at 0: a negative start would wrap around and
            # pick readings from the END of the series for the first rows.
            # NOTE(review): this window covers pos-5 .. pos-2 (the day directly
            # before the gap is excluded) while `following` covers
            # pos+1 .. pos+4 - kept as in the original, confirm it is intended.
            preceding = values[max(pos - 5, 0):max(pos - 1, 0)]
            if following.size and preceding.size:
                values[pos] = (following.mean() + preceding.mean()) / 2
        df[col] = values

In [None]:
df.isnull().sum()

There is a big gap in the o3 measurements, so we cannot use the 10-day interval for o3 there. Instead, we will use the SimpleImputer from scikit-learn to fill the missing data for o3 only.

In [None]:
# `df1` is another name for `df` (no copy), so the imputation below updates
# `df` as well.
df1 = df
# Imputer that replaces NaN with the mean of the column it appears in.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# Columns at positions 1..6 are selected - presumably the six pollutant
# columns that directly follow "date"; TODO confirm the column order.
imp_mean = imp_mean.fit(df1.iloc[:,1:7])
df1.iloc[:,1:7] = imp_mean.transform(df1.iloc[:,1:7])
# Remaining missing values per column.
df1.isnull().sum()

# Calculating the Sub_Index Values for the air pollution and calculating the AQI

To calculate the Air Quality Index, we start by calculating the sub-index value of each pollutant. The highest sub-index value is taken as the AQI.

In [None]:
## PM2.5 Sub-Index calculation
def get_PM25_subindex(x):
    """Return the piecewise-linear sub-index for a PM2.5 reading.

    The reading is matched against bands with upper bounds 30, 60, 90, 120
    and 250; anything above 250 uses the open-ended top band. A value for
    which every comparison is false (e.g. NaN) yields 0.
    """
    if x <= 30:
        return x * 50 / 30
    # (band upper bound, sub-index at band start, reading at band start,
    #  sub-index span, reading span)
    bands = (
        (60, 50, 30, 50, 30),
        (90, 100, 60, 100, 30),
        (120, 200, 90, 100, 30),
        (250, 300, 120, 100, 130),
    )
    for upper, base, start, rise, run in bands:
        if x <= upper:
            return base + (x - start) * rise / run
    if x > 250:
        return 400 + (x - 250) * 100 / 130
    # Only reached when no comparison held, e.g. for NaN.
    return 0
    
## PM10 Sub-Index calculation
def get_PM10_subindex(x):
    """Return the piecewise-linear sub-index for a PM10 reading.

    Readings up to 100 are returned unchanged; higher readings are mapped
    band by band (upper bounds 250, 350, 430, then open-ended). A value for
    which every comparison is false (e.g. NaN) yields 0.
    """
    # Both lowest bands map the reading onto itself.
    if x <= 100:
        return x
    if x <= 250:
        return 100 + (x - 100) * 100 / 150
    if x <= 350:
        return 200 + (x - 250)
    if x <= 430:
        return 300 + (x - 350) * 100 / 80
    if x > 430:
        return 400 + (x - 430) * 100 / 80
    # Only reached when no comparison held, e.g. for NaN.
    return 0

## SO2 Sub-Index calculation
def get_SO2_subindex(x):
    """Return the piecewise-linear sub-index for an SO2 reading.

    Bands have upper bounds 40, 80, 380, 800 and 1600; anything above 1600
    uses the open-ended top band. A value for which every comparison is
    false (e.g. NaN) yields 0.
    """
    if x <= 40:
        return x * 50 / 40
    # (band upper bound, sub-index at band start, reading at band start,
    #  sub-index span, reading span)
    bands = (
        (80, 50, 40, 50, 40),
        (380, 100, 80, 100, 300),
        (800, 200, 380, 100, 420),
        (1600, 300, 800, 100, 800),
    )
    for upper, base, start, rise, run in bands:
        if x <= upper:
            return base + (x - start) * rise / run
    if x > 1600:
        return 400 + (x - 1600) * 100 / 800
    # Only reached when no comparison held, e.g. for NaN.
    return 0
    



## NOx Sub-Index calculation
def get_NOx_subindex(x):
    """Return the piecewise-linear sub-index for a NOx reading.

    Bands have upper bounds 40, 80, 180, 280 and 400; anything above 400
    uses the open-ended top band. A value for which every comparison is
    false (e.g. NaN) yields 0.
    """
    if x <= 40:
        return x * 50 / 40
    # (band upper bound, sub-index at band start, reading at band start,
    #  sub-index span, reading span)
    bands = (
        (80, 50, 40, 50, 40),
        (180, 100, 80, 100, 100),
        (280, 200, 180, 100, 100),
        (400, 300, 280, 100, 120),
    )
    for upper, base, start, rise, run in bands:
        if x <= upper:
            return base + (x - start) * rise / run
    if x > 400:
        return 400 + (x - 400) * 100 / 120
    # Only reached when no comparison held, e.g. for NaN.
    return 0
    


## CO Sub-Index calculation
def get_CO_subindex(x):
    """Return the piecewise-linear sub-index for a CO reading.

    Bands have upper bounds 1, 2, 10, 17 and 34; anything above 34 uses the
    open-ended top band. A value for which every comparison is false
    (e.g. NaN) yields 0.
    """
    if x <= 1:
        return x * 50 / 1
    # (band upper bound, sub-index at band start, reading at band start,
    #  sub-index span, reading span)
    bands = (
        (2, 50, 1, 50, 1),
        (10, 100, 2, 100, 8),
        (17, 200, 10, 100, 7),
        (34, 300, 17, 100, 17),
    )
    for upper, base, start, rise, run in bands:
        if x <= upper:
            return base + (x - start) * rise / run
    if x > 34:
        return 400 + (x - 34) * 100 / 17
    # Only reached when no comparison held, e.g. for NaN.
    return 0

    
## O3 Sub-Index calculation
def get_O3_subindex(x):
    """Return the piecewise-linear sub-index for an O3 reading.

    Bands have upper bounds 50, 100, 168, 208 and 748; anything above 748
    uses the open-ended top band. A value for which every comparison is
    false (e.g. NaN) yields 0.

    The top band is measured from its own lower breakpoint (748). It was
    previously measured from 400, which made the sub-index jump from about
    400 to about 465 as soon as a reading exceeded 748.
    """
    if x <= 50:
        return x * 50 / 50
    # (band upper bound, sub-index at band start, reading at band start,
    #  sub-index span, reading span)
    bands = (
        (100, 50, 50, 50, 50),
        (168, 100, 100, 100, 68),
        (208, 200, 168, 100, 40),
        # The reading span 539 is kept as written in the original formula.
        (748, 300, 208, 100, 539),
    )
    for upper, base, start, rise, run in bands:
        if x <= upper:
            return base + (x - start) * rise / run
    if x > 748:
        return 400 + (x - 748) * 100 / 539
    # Only reached when no comparison held, e.g. for NaN.
    return 0

In [None]:
# Add one "<pollutant>_sub_index" column per pollutant by applying the
# matching sub-index function to every reading of that pollutant.
_subindex_by_pollutant = (
    ("pm25", get_PM25_subindex),
    ("pm10", get_PM10_subindex),
    ("o3", get_O3_subindex),
    ("no2", get_NOx_subindex),
    ("so2", get_SO2_subindex),
    ("co", get_CO_subindex),
)
for _pollutant, _to_subindex in _subindex_by_pollutant:
    df[_pollutant + "_sub_index"] = df[_pollutant].apply(_to_subindex)
df.head()

In [None]:
# The AQI of a day is its largest pollutant sub-index, rounded to a whole number.
_sub_index_columns = [
    'pm25_sub_index',
    'pm10_sub_index',
    'o3_sub_index',
    'no2_sub_index',
    'so2_sub_index',
    'co_sub_index',
]
df["AQI"] = df[_sub_index_columns].max(axis=1).round()
df.head()

In [None]:
color_AQI = ["maroon", "purple", "red", "orange", "yellow","green"]

def get_AQI_bucket(x):
    """Translate a numeric AQI value into its descriptive category.

    Returns one of "Good", "Moderate", "Unhealthy for sensitive groups",
    "Unhealthy", "Very unhealthy" or "Hazardous". A value for which every
    comparison is false (e.g. NaN) returns NaN.
    """
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Moderate"
    elif x <= 150:
        return "Unhealthy for sensitive groups"
    elif x <= 200:
        return "Unhealthy"
    elif x <= 300:
        return "Very unhealthy"
    elif x > 300:
        return "Hazardous"
    else:
        # `np.nan` is the supported spelling; the `np.NaN` alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan

In [None]:
df["AQI_Explained"] = df["AQI"].apply(lambda x: get_AQI_bucket(x))

In [None]:
df.head()

In [None]:
sns.set_palette(color_AQI)
# Worst category first; also reused by the per-year plots further down.
category_order = ["Hazardous",
                 "Very unhealthy",
                  "Unhealthy",
                  "Unhealthy for sensitive groups",
                  "Moderate",
                  "Good"
                 ]

# `catplot` is a figure-level function that creates its own figure, so a
# preceding `plt.figure(figsize=(16,8))` only produced an extra, empty figure.
# The intended 16x8 size is requested through height/aspect instead.
sns.catplot(x="AQI_Explained", data = df, kind="count", order=category_order,
            height=8, aspect=2)

plt.xticks(rotation=90)
plt.show()


In [None]:
df_time = df.set_index("date")

In [None]:
df_time

# Explaining the Data, and checking the COVID-19 Effect on AQI

In [None]:
df_time.loc["2020"].AQI.mean()

In [None]:
# Years to report on, most recent first; each string is used as a key into
# the date index of `df_time` to select that year's rows.
years = ["2021","2020","2019","2018","2017","2016","2015","2014"]
for year in years:
    sns.set_palette(color_AQI)
    # One count plot per year, with the categories in the shared order.
    sns.catplot( x= "AQI_Explained",data = df_time.loc[year],kind = "count", order=category_order)
    plt.xticks(rotation=90)
    plt.title(year)


In [None]:
for year in years:
    print (year)
    print (df_time.loc[year]["AQI_Explained"].value_counts())
    print()
    print ("--------------"*3)
    print()

In [None]:
# For every year (oldest first) print the share of days in each of the three
# worst AQI categories and their combined share.
_worst_categories = ["Hazardous", "Very unhealthy", "Unhealthy"]
for year in reversed(years) :
    print()

    labels_of_year = df_time.loc[year]["AQI_Explained"]
    days = labels_of_year.count()
    # reindex(..., fill_value=0) reports a category that never occurred in a
    # year as 0 days; looking it up directly in value_counts() raises KeyError.
    day_counts = labels_of_year.value_counts().reindex(_worst_categories, fill_value=0)

    for category in _worst_categories:
        Percent = round ((day_counts[category]/days)*100)
        print("In " + str(year) + " %"+str(Percent)+ " of the days were " + category)

    Percent = round ((day_counts.sum()/days)*100)
    print("In " + str(year) + " %"+str(Percent)+ " of the days the air was unhealthy for the general public")

    print()
    print ("------------------"*3)
    print()

As we have seen, from the start of COVID-19 the AQI was affected by the lockdowns in 2020; therefore we will not use the data from the COVID-19 period, which would distort our prediction model.


We will also write the cleaned data to a new CSV file to be used in further analysis, for example if we want to add another variable, such as energy consumption, to make our model better. However, I couldn't get the energy consumption data yet
(the main reason is that all the energy consumption data for Shanghai is paid data).

In [None]:
# Drop the raw pollutant readings and their sub-indexes: now that the AQI has
# been calculated they are no longer needed in the cleaned data.
# errors="ignore" keeps the cell re-runnable; without it a second run raises
# KeyError because the columns are already gone.
df_time.drop(columns=['pm25', 'pm10', 'o3', 'no2', 'so2', 'co','pm25_sub_index', 'pm10_sub_index',
       'o3_sub_index', 'no2_sub_index', 'so2_sub_index', 'co_sub_index'],
             errors="ignore", inplace=True)

# Persist the cleaned data; the later sections read this file back in.
df_time.to_csv("All Values Cleaned Shanghai1.csv")
#df_time.to_excel("Shanghai AQI Analysis Data.xlsx")

In [None]:
ls

# Daily Predictions

## 1 - Feature Selection

For this part of the analysis we will use two separate feature selection methods and compare their results.

- Lasso Feature Selection
- F Statistics

In [None]:
# Reload the cleaned data, parse the dates, order the rows chronologically and
# use the date as index. `df` and `df_sorted` refer to the same frame.
df_sorted = (
    pd.read_csv("All Values Cleaned Shanghai1.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
    .sort_values(by=["date"])
    .set_index("date")
)
df = df_sorted
df

In [None]:
# Keep every column except the first one and the last two.
# NOTE(review): the last two are presumably AQI and AQI_Explained (the
# targets). With "date" now the index, the first column is presumably
# "maxtempC", which is therefore excluded from the features - confirm that
# this is intended and not a leftover from when "date" was column 0.
columns_to_be_used = df.columns[1:(len(df.columns)-2)]

In [None]:
x = df[columns_to_be_used]
y = df.AQI
x

### 1.1 Feature Selection Using F Statistics

In [None]:
p_values = f_regression (x,y)[1]
p_values.round(5)

In [None]:
# Collect the features whose p-value is still above zero after rounding to
# three decimals. Despite the variable name, these are the columns that get
# dropped from the F-statistics feature set afterwards.
columns_to_be_selected_using_f_statistics = [
    feature
    for feature, p_value in zip(x.columns, p_values.round(3))
    if p_value > 0
]

columns_to_be_selected_using_f_statistics

In [None]:
x_fstat = df.drop(columns_to_be_selected_using_f_statistics, axis=1)

In [None]:
x_fstat.drop(["AQI","AQI_Explained"],axis=1,inplace=True)

### 1.2 Lasso Feature Selection

In [None]:
names = x.columns
lasso = Lasso(alpha=0.2)
lasso_coef = lasso.fit(x, y).coef_

lasso_coef_data = pd.DataFrame({"Features":names,"lasso_coef": lasso_coef })
sns.barplot(data=lasso_coef_data, x= "Features", y = "lasso_coef")
plt.xticks(rotation = 75)
plt.show()

In [None]:
lasso_coef = np.abs(lasso_coef)
lasso_Coef = pd.DataFrame({'Columns':names,'Coef':lasso_coef})
lasso_Coef

In [None]:
columns_to_be_delated = lasso_Coef[lasso_Coef.Coef < 1].Columns; columns_to_be_delated

In [None]:
# Rebind `x` to a reduced frame instead of dropping in place: `x` was selected
# from `df`, so an in-place drop triggers SettingWithCopyWarning, and it raises
# KeyError when the cell is run a second time. errors="ignore" makes the
# re-run a no-op.
x = x.drop(columns=columns_to_be_delated, errors="ignore"); x

## 2 Predictions

###  2.1 Lasso Selection Features


In [None]:
x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
#Linear Regression

regression = LinearRegression()
regression.fit(x_train,y_train)
y_predicted_sk_simple = regression.predict(x_test)
score_reg = regression.score(x_test,y_test)


#Cross Validation Regression

reg= LinearRegression()
# R² of each of the 4 folds, computed on the training period.
cv_resutls = cross_val_score(reg,x_train,y_train,cv=4); cv_resutls

# Out-of-fold predictions for the test period.
cross_y_pred = cross_val_predict(reg,x_test,y_test,cv=4)
score_cross = np.mean(cv_resutls)

# The `normalize=True` estimator argument was removed in scikit-learn 1.2.
# The same model is obtained by scaling the features in a pipeline and
# rescaling alpha: alpha * n_samples for Ridge, alpha * sqrt(n_samples) for
# Lasso.
n_train = len(x_train)

#Ridge Regression

ridge = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.1 * n_train))
ridge.fit(x_train,y_train)
ridge_pred = ridge.predict(x_test)
score_ridge = ridge.score (x_test,y_test)

#Lasso Regression

lasso = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.1 * np.sqrt(n_train)))
lasso.fit(x_train, y_train)
lasso_pred = lasso.predict(x_test)
score_lasso = lasso.score(x_test, y_test)

In [None]:

score_comparison_daily = pd.DataFrame([score_reg,score_cross,score_ridge,score_lasso],
                                columns=["Scores"],
                                index= ['Linear_Reg','Cross_Val','Ridge','Lasso'],dtype=float)

In [None]:
score_comparison_daily

### 2.2 F Statistics Features

In [None]:
x = x_fstat

x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
#Linear Regression

regression = LinearRegression()
regression.fit(x_train,y_train)
y_predicted_sk_simple = regression.predict(x_test)
score_reg = regression.score(x_test,y_test)


#Cross Validation Regression

reg= LinearRegression()
# R² of each of the 4 folds, computed on the training period.
cv_resutls = cross_val_score(reg,x_train,y_train,cv=4); cv_resutls

# Out-of-fold predictions for the test period.
cross_y_pred = cross_val_predict(reg,x_test,y_test,cv=4)
score_cross = np.mean(cv_resutls)

# The `normalize=True` estimator argument was removed in scikit-learn 1.2.
# The same model is obtained by scaling the features in a pipeline and
# rescaling alpha: alpha * n_samples for Ridge, alpha * sqrt(n_samples) for
# Lasso.
n_train = len(x_train)

#Ridge Regression

ridge = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.1 * n_train))
ridge.fit(x_train,y_train)
ridge_pred = ridge.predict(x_test)
score_ridge = ridge.score (x_test,y_test)

#Lasso Regression

lasso = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.1 * np.sqrt(n_train)))
lasso.fit(x_train, y_train)
lasso_pred = lasso.predict(x_test)
score_lasso = lasso.score(x_test, y_test)

In [None]:
comparison_daily_F = pd.DataFrame({"Linear_Reg":y_predicted_sk_simple, 
                           "Cross Val" :cross_y_pred, 
                           "Ridge": ridge_pred,
                          "Lasso": lasso_pred,
                          "AQI":y_test},index=y_test.index)

In [None]:
comparison_daily_F

In [None]:
score_comparison_daily_F = pd.DataFrame([score_reg,score_cross,score_ridge,score_lasso],
                                columns=["Scores"],
                                index= ['Linear_Reg','Cross_Val','Ridge','Lasso'],dtype=float)

In [None]:
score_comparison_daily_F

In [None]:
daily_score_comp = score_comparison_daily.merge(score_comparison_daily_F,
                                                on = score_comparison_daily.index,
                                                how= "left", suffixes=("_Lasso","_F"))

In [None]:
daily_score_comp.columns =["Algorithm","Lesso_Feature","F_Stat_Feature"]
daily_score_comp.set_index("Algorithm",inplace=True)

In [None]:
daily_score_comp

Daily predictions have low scores. Because daily prediction is difficult, we can try to predict monthly values instead, which may give a much higher prediction score.

# 3. MONTHLY ANALYSIS

In [None]:
# Reload the cleaned daily data and parse the dates.
df = pd.read_csv("All Values Cleaned Shanghai1.csv")
df.date = pd.to_datetime(df['date'], format="%Y-%m-%d") 

# Note: `df_sorted` is not used by the statements below; the grouping works
# on `df` directly.
df_sorted = df.sort_values(by= ["date"])
# Mean AQI per calendar month, keyed by a "YYYY-MM" string.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))['AQI'].mean()
df_month = pd.DataFrame(monthly)
# `y` is a one-column DataFrame (AQI) ordered by its month key.
y = df_month.sort_values(by= ["date"])

In [None]:
# Monthly mean of each weather feature, keyed by a "YYYY-MM" string.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple raises an error in pandas 2.x.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))[['mintempC', 'uvIndex', 'DewPointC',
                                                       'FeelsLikeC', 'HeatIndexC',
       'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM',
       'pressure', 'tempC', 'winddirDegree', 'windspeedKmph']].mean()
df_month = pd.DataFrame(monthly)
x = df_month.sort_values(by= ["date"])
x

## Feature Selections

### 3.1 F Statistics Feature Selections

In [None]:
# p-value of the univariate F-test of every monthly feature against the AQI.
p_values = f_regression(x, y)[1]
print(p_values.round(3))

# Features whose p-value is still above zero after rounding to ten decimals.
# Despite the variable name, these are the columns that get dropped from the
# F-statistics feature set afterwards.
columns_to_be_selected_using_f_statistics = [
    feature
    for feature, p_value in zip(x.columns, p_values.round(10))
    if p_value > 0
]

print(columns_to_be_selected_using_f_statistics)

In [None]:
x_fstat = x.drop(columns_to_be_selected_using_f_statistics,axis=1); x_fstat.columns

### 3.2 Lasso Feature Selection

In [None]:
names = x.columns
lasso = Lasso(alpha=0.2)
lasso_coef = lasso.fit(x, y).coef_

lasso_coef_data = pd.DataFrame({"Features":names,"lasso_coef": lasso_coef })
sns.barplot(data=lasso_coef_data, x= "Features", y = "lasso_coef")
plt.xticks(rotation = 75)
plt.show()

In [None]:
lasso_coef = np.abs(lasso_coef)
lasso_Coef = pd.DataFrame({'Columns':names,'Coef':lasso_coef})
lasso_Coef

In [None]:
columns_to_be_delated = lasso_Coef[lasso_Coef.Coef < 1].Columns; print(columns_to_be_delated)

lasso_feature_x = x.drop(columns_to_be_delated,axis=1); lasso_feature_x.shape

## 4. Monthly Predictions

### 4.1 LASSO FEATURE SELECTED

In [None]:
x = lasso_feature_x
x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
def Ridge_Regression (x_train,y_train,x_test,y_test):
    """Fit scaled Ridge models for alpha = 0.1 ... 0.8 and return the best one.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : data used to score every alpha and to produce the
        returned predictions.

    Returns
    -------
    tuple
        ``(best_score, predictions)``: the highest R² reached on the test
        data and the test predictions of the model that reached it (the
        smallest alpha wins a tie).

    NOTE(review): alpha is selected by its score on the test data, so the
    returned score is an optimistic estimate of out-of-sample performance.
    """
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # `normalize=True` was removed in scikit-learn 1.2. Scaling the features
    # with StandardScaler(with_mean=False) and multiplying alpha by the number
    # of training samples gives the same Ridge model.
    n_samples = len(x_train)

    Alpha_array = []
    Score_array = []
    predictions = []
    for step in range (1,9):
        alpha = step/10
        ridge = make_pipeline(StandardScaler(with_mean=False),
                              Ridge(alpha=alpha * n_samples))
        ridge.fit(x_train,y_train)
        score_ridge = ridge.score (x_test,y_test)
        Score_array.append(score_ridge)
        Alpha_array.append(alpha)
        # Keep the predictions so the winning model does not have to be refitted.
        predictions.append(ridge.predict(x_test))
        # Report the alpha actually used (0.1 ... 0.8), not the loop step.
        print ("alpha = " + str(alpha)+ "  Score = " + str(score_ridge))

    score_ridge_df = pd.DataFrame({"Scores": Score_array,"Alpha":Alpha_array})
    # idxmax returns the first row holding the maximum score.
    best_row = score_ridge_df["Scores"].idxmax()

    return score_ridge_df.loc[best_row, "Scores"] , predictions[best_row]


In [None]:
def Lasso_Regression (x_train,y_train,x_test,y_test):
    """Fit scaled Lasso models for alpha = 0.1 ... 0.9 and return the best one.

    Parameters
    ----------
    x_train, y_train : training features and target.
    x_test, y_test : data used to score every alpha and to produce the
        returned predictions.

    Returns
    -------
    tuple
        ``(best_score, predictions)``: the highest R² reached on the test
        data and the test predictions of the model that reached it (the
        smallest alpha wins a tie).

    NOTE(review): alpha is selected by its score on the test data, so the
    returned score is an optimistic estimate of out-of-sample performance.
    """
    from sklearn.linear_model import Lasso
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # `normalize=True` was removed in scikit-learn 1.2. Scaling the features
    # with StandardScaler(with_mean=False) and multiplying alpha by the square
    # root of the number of training samples gives the same Lasso model.
    alpha_scale = len(x_train) ** 0.5

    Alpha_array = []
    Score_array = []
    predictions = []
    for step in range (1,10):
        alpha = step/10
        lasso = make_pipeline(StandardScaler(with_mean=False),
                              Lasso(alpha=alpha * alpha_scale))
        lasso.fit(x_train,y_train)
        score_lasso = lasso.score (x_test,y_test)
        Score_array.append(score_lasso)
        Alpha_array.append(alpha)
        # Keep the predictions so the winning model does not have to be refitted.
        predictions.append(lasso.predict(x_test))
        # Report the alpha actually used (0.1 ... 0.9), not the loop step.
        print ("alpha = " + str(alpha)+ "  Score = " + str(score_lasso))

    score_lasso_df = pd.DataFrame({"Scores": Score_array,"Alpha":Alpha_array})
    # idxmax returns the first row holding the maximum score.
    best_row = score_lasso_df["Scores"].idxmax()

    return score_lasso_df.loc[best_row, "Scores"] , predictions[best_row]


## Regression Models

In [None]:
#Linear Regression

regression = LinearRegression()
regression.fit(x_train,y_train)
y_predicted_sk_simple = regression.predict(x_test)
score_reg = regression.score(x_test,y_test)


#Cross Validation Regression

reg= LinearRegression()
# R² of each of the 4 folds, computed on the training period.
cv_resutls = cross_val_score(reg,x_train,y_train,cv=4); cv_resutls

# Out-of-fold predictions for the test period.
cross_y_pred = cross_val_predict(reg,x_test,y_test,cv=4)
# Score those predictions against the test period, the same definition that
# section 4.2 uses. This was previously the mean training-fold score, so the
# "Cross_Val" row of the final comparison table mixed two different metrics.
score_cross = r2_score(y_test, cross_y_pred)

#Ridge Regression

print("Ridge Regression")
score_ridge , ridge_pred = Ridge_Regression(x_train,y_train,x_test,y_test)


#Lasso Regression

print()
print("Lasso Regression")

score_lasso , lasso_pred = Lasso_Regression(x_train,y_train,x_test,y_test)

Gather all the data for cross check to see the values in each different regression models

In [None]:
# Wrap each model's test-period predictions in a one-column frame that shares
# the index of `y_test`.
data_cross_valid = pd.DataFrame(cross_y_pred,index=y_test.index)
data_ridge = pd.DataFrame(ridge_pred,index=y_test.index)
data_lineer = pd.DataFrame(y_predicted_sk_simple,index=y_test.index)
data_lasso = pd.DataFrame(lasso_pred,index=y_test.index)

# Combine the frames one at a time, renaming the columns after every merge.
# "date" presumably refers to the name of the shared index - TODO confirm.
data1 = data_cross_valid.merge(data_ridge, on = "date", how="left")
data1.columns = ["Cross_Valid", "Ridge"]

data2 = data1.merge(data_lasso,on = "date", how="left")
data2.columns = ["Cross_Valid", "Ridge", "Lasso"]

data3 = data2.merge (data_lineer, on = "date", how = "left")
data3.columns = ["Cross_Valid", "Ridge", "Lasso", "Lineer"]

# Finally add the actual AQI values next to the predictions.
prediction_compare_monthly = data3.merge (y_test,on ="date", how= "left")
prediction_compare_monthly.columns = ["Cross_Valid", "Ridge", "Lasso", "Lineer", "AQI"]

prediction_compare_monthly

In [None]:
def convert_to_explained_AQI (data):
    """Return a copy of ``data`` with every value replaced by its AQI category.

    Each column is passed element by element through ``get_AQI_bucket``.
    The input frame is left untouched: previously the result was only an
    alias of ``data``, so the caller's numeric values were overwritten with
    category labels.
    """
    explained_AQI = data.copy()
    for i in data.columns:
        explained_AQI[i] = data[i].apply(get_AQI_bucket)
    return explained_AQI
    

In [None]:
my_data_Lasso_Selected = convert_to_explained_AQI(prediction_compare_monthly); my_data_Lasso_Selected

In [None]:
score_comparison_Monthly = pd.DataFrame([score_reg,score_cross,score_ridge,score_lasso],
                                columns=["Scores"],
                                index= ['Linear_Reg','Cross_Val','Ridge','Lasso'],dtype=float)
score_comparison_Monthly

## 4.2  F Statistics Features

In [None]:
x = x_fstat

x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
#Linear Regression

regression = LinearRegression()
regression.fit(x_train,y_train)
y_predicted_sk_simple = regression.predict(x_test)
score_reg = regression.score(x_test,y_test)


#Cross Validation Regression

reg= LinearRegression()
# R² of each of the 4 folds, computed on the training period.
cv_resutls = cross_val_score(reg,x_train,y_train,cv=4); cv_resutls

# Out-of-fold predictions for the test period, scored against the actual
# test values (true values first, predictions second).
cross_y_pred = cross_val_predict(reg,x_test,y_test,cv=4)
y_true = y_test
y_pred1 = cross_y_pred
score_cross = r2_score(y_true, y_pred1)

#Ridge Regression

# Ridge_Regression tries several alpha values and returns the best score
# together with the matching predictions.
print()
print("Ridge Regression")

score_ridge , ridge_pred = Ridge_Regression(x_train,y_train,x_test,y_test)

#Lasso Regression

# Lasso_Regression tries several alpha values and returns the best score
# together with the matching predictions.

print()
print("Lasso Regression")

score_lasso , lasso_pred = Lasso_Regression(x_train,y_train,x_test,y_test)

In [None]:
# Wrap each model's test-period predictions in a one-column frame that shares
# the index of `y_test`.
data_cross_valid = pd.DataFrame(cross_y_pred,index=y_test.index)
data_ridge = pd.DataFrame(ridge_pred,index=y_test.index)
data_lineer = pd.DataFrame(y_predicted_sk_simple,index=y_test.index)
data_lasso = pd.DataFrame(lasso_pred,index=y_test.index)

# Combine the frames one at a time, renaming the columns after every merge.
# "date" presumably refers to the name of the shared index - TODO confirm.
data1 = data_cross_valid.merge(data_ridge, on = "date", how="left")
data1.columns = ["Cross_Valid", "Ridge"]

data2 = data1.merge(data_lasso,on = "date", how="left")
data2.columns = ["Cross_Valid", "Ridge", "Lasso"]

data3 = data2.merge (data_lineer, on = "date", how = "left")
data3.columns = ["Cross_Valid", "Ridge", "Lasso", "Lineer"]

# Finally add the actual AQI values next to the predictions.
pred_comp_monthly_F = data3.merge (y_test,on ="date", how= "left")
pred_comp_monthly_F.columns = ["Cross_Valid", "Ridge", "Lasso", "Lineer", "AQI"]


pred_comp_monthly_F

In [None]:
my_data_F_Selected = convert_to_explained_AQI(pred_comp_monthly_F); my_data_F_Selected

In [None]:
score_comp_Monthly_F = pd.DataFrame([score_reg,score_cross,score_ridge,score_lasso],
                                columns=["Scores"],
                                index= ['Linear_Reg','Cross_Val','Ridge','Lasso'],dtype=float)
score_comp_Monthly_F

In [None]:
import math
# Despite its name, `mse` collects ROOT mean squared errors (the square root
# of each MSE), in this order: cross-validated, Lasso, linear and Ridge
# predictions.
mse =[]
for i in [cross_y_pred,lasso_pred,y_predicted_sk_simple,ridge_pred]:
    
    mse.append(math.sqrt(mean_squared_error(y_test ,i)) )

mse

In [None]:
two_feature_selection_comp = score_comparison_Monthly.merge(score_comp_Monthly_F, 
                                                            how = "left", 
                                                            on=score_comparison_Monthly.index, 
                                                            suffixes=["_Lasso","_F"])
two_feature_selection_comp.columns = ["Algorithm","Lasso_Feature_Selection","F_Feature_Selection"]
two_feature_selection_comp.set_index("Algorithm",inplace=True)

In [None]:
features_used_F      = pd.DataFrame({"id":range(len(x_fstat.columns)),"F_Selected":x_fstat.columns})
features_used_Lasso  = pd.DataFrame ({"id":range(len(lasso_feature_x.columns)),"Lasso_Selected":lasso_feature_x.columns})


In [None]:
features_used = pd.merge(features_used_Lasso,features_used_F, left_on="id",right_on="id", how= "outer")
features_used.set_index("id",inplace=True)

### We have used both daily and monthly analysis.
#### Monthly mean predictions worked much better at showing the effects of the weather conditions on AQI.
#### Therefore, only the monthly results of the model are shown in the conclusion below.

We have used 2 different feature selection methods 

1) Lasso Feature Selection 

2) F Feature Selection

The Features selected to be used in both regression models are as below:

In [None]:
features_used 

# According to both feature selection models, the following weather conditions have the highest impact on the AQI

#### 1) Minimum Temp C

#### 2) Feels Like C 

#### 3) UV Index : 
The ultraviolet index, or UV index, is an international standard measurement of the strength of sunburn-producing ultraviolet (UV) radiation at a particular place and time.

#### 4) DewPointC : 
Represents the temperature to which air would have to be cooled to reach a level of moisture saturation. When it reaches the dew point, droplets of water, or dew, begin to form on solid objects like grass and cars.

#### 5) Heat Index C : 
represents how hot the temperature actually feels when humidity is considered. The more humid the air is, the less perspiration is able to evaporate, which cripples the human body’s cooling system and makes it feel hotter when it’s humid outside.

#### 6) Wind Chill C : 
Also known as the “feels-like” temperature, wind chill represents how cold the weather feels on human skin when the chilling effect of the wind is taken into consideration.

After selecting which features to be used, we have used 4 different regression algorithms for predictions

1) Linear Regression

2) Cross Validation

3) Ridge Regression

4) Lasso Regression

In [None]:
two_feature_selection_comp

## Decision Tree Reg

In [None]:
def decision_tree_reg (x_train, y_train, x_test, y_test):
    """Grid-search a DecisionTreeRegressor and return its best test result.

    Every combination of ``max_depth`` (1-6), ``min_samples_leaf``
    (0.1-0.4, a fraction of the samples) and ``random_state`` (0-9) is fitted
    on the training data and scored (R²) on the test data.

    Returns
    -------
    tuple
        ``(best_score, rmse, predictions)`` where ``best_score`` is the
        highest test R², and ``rmse`` / ``predictions`` come from a tree
        refitted with the hyper-parameters that reached that score.

    When several settings reach the best score, the one with the smallest
    ``max_depth``, then ``min_samples_leaf``, then ``random_state`` is used.
    The settings are always taken from ONE grid row: taking the minimum of
    each hyper-parameter independently could yield a combination that never
    reached the best score, and passing the filtered pandas Series straight
    to the estimator (as happened when only one row matched) is not a valid
    hyper-parameter value.

    NOTE(review): the hyper-parameters are selected on the test data, so the
    returned score is an optimistic estimate of out-of-sample performance.
    """
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_squared_error as MSE

    records = []
    for max_depth in range(1,7):
        for min_samp in range(1,5):
            for random_st in range (10):
                dt = DecisionTreeRegressor(max_depth=max_depth,
                              min_samples_leaf=(min_samp/10),
                              random_state=random_st)
                dt.fit(x_train,y_train)
                records.append({"max_depth": max_depth,
                                "min_samples_leaf": min_samp/10,
                                "random_state": random_st,
                                "Scores": dt.score(x_test,y_test)})

    score_tree_df = pd.DataFrame(records)
    best_score = score_tree_df.Scores.max()

    # Simplest setting among all rows that reach the best score.
    best_rows = score_tree_df[score_tree_df.Scores == best_score]
    best = best_rows.sort_values(["max_depth", "min_samples_leaf", "random_state"]).iloc[0]

    # The row is a float Series; convert back to the plain types the
    # estimator expects.
    dt = DecisionTreeRegressor(max_depth=int(best["max_depth"]),
                  min_samples_leaf=float(best["min_samples_leaf"]),
                  random_state=int(best["random_state"]))
    dt.fit(x_train,y_train)
    y_pred_tree = dt.predict(x_test)
    mse_dt = MSE (y_test,y_pred_tree)
    rmse_dt = mse_dt**(1/2)
    return  best_score  , rmse_dt , y_pred_tree

In [None]:
x = lasso_feature_x
x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
 score_tree_lasso, rmse_dt_lasso, y_pre_tree_lasso = decision_tree_reg(x_train,y_train,x_test,y_test)

In [None]:
x = x_fstat
x_train = x.loc["2014-01":"2017-12"]
y_train = y.loc["2014-01":"2017-12"]

x_test = x.loc["2018-01":"2019-10"]
y_test = y.loc["2018-01":"2019-10"]

In [None]:
 score_tree_f, rmse_dt_f, y_pre_tree_f = decision_tree_reg(x_train,y_train,x_test,y_test)

In [None]:
df_tree = pd.DataFrame({"Lasso_Feature_Selection":score_tree_lasso,
                        "F_Feature_Selection": score_tree_f},index=["Decision_reg_tree"])
two_feature_sel_final = pd.concat([two_feature_selection_comp,df_tree])

In [None]:
two_feature_sel_final

# TENSORFLOW

In [None]:
import tensorflow as tf

In [None]:
# Reload the cleaned daily data and parse the dates.
df = pd.read_csv("All Values Cleaned Shanghai1.csv")
df.date = pd.to_datetime(df['date'], format="%Y-%m-%d") 

# Note: `df_sorted` is not used by the statements below; the grouping works
# on `df` directly.
df_sorted = df.sort_values(by= ["date"])
# Mean AQI per calendar month, keyed by a "YYYY-MM" string.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))['AQI'].mean()
df_month = pd.DataFrame(monthly)
# `y` is a one-column DataFrame (AQI) ordered by its month key.
y = df_month.sort_values(by= ["date"])

In [None]:
# Monthly mean of each weather feature, keyed by a "YYYY-MM" string.
# The columns are selected with a list (double brackets): selecting several
# GroupBy columns with a bare tuple raises an error in pandas 2.x.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))[['mintempC', 'uvIndex', 'DewPointC',
                                                       'FeelsLikeC', 'HeatIndexC',
       'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM',
       'pressure', 'tempC', 'winddirDegree', 'windspeedKmph']].mean()
df_month = pd.DataFrame(monthly)
x = df_month.sort_values(by= ["date"])
x

### Below we will define a NN engine which will help us analyze the steps visually and then compare the results with each other; it will also save us some space in our analysis

In [None]:
# IMPORTANT : x_train, y_train, x_test, y_test should be dataframes.

def _rmse(predictions, targets):
    """Return the root of the mean squared difference between two arrays."""
    return np.sqrt(((predictions - targets) ** 2).mean())


def _training_rounds(steps_to_see, epochs):
    """Split `epochs` into rounds of `steps_to_see`, plus one shorter round for any remainder."""
    full_rounds, remainder = divmod(epochs, steps_to_see)
    return [steps_to_see] * full_rounds + ([remainder] if remainder else [])


def tf_nn(x_train, y_train, x_test, y_test, steps_to_see, epochs):
    """Train a single-Dense-layer Keras regressor and report on it at regular checkpoints.

    The model is trained in rounds of `steps_to_see` epochs. After every round the
    test set is predicted, predictions and actual values are plotted against the
    test index, and the R2 score and RMSE are printed.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames.
    y_train, y_test : pandas.DataFrame
        Single-column target frames; the index of `y_test` labels the plot's x axis.
    steps_to_see : int
        Number of epochs between two plotted/printed checkpoints.
    epochs : int
        Total number of epochs to train. If it is not a multiple of `steps_to_see`,
        a final shorter round trains the remaining epochs.

    Returns
    -------
    tuple
        ``(tf_data_merged, R2, rmse_val, explained_tf_df)`` where
        ``tf_data_merged`` is the last checkpoint's comparison frame with columns
        ``Date``, ``Predicted_Y`` and ``Test_Y``; ``R2`` and ``rmse_val`` are the
        last checkpoint's score and root mean squared error; ``explained_tf_df``
        is a one-row frame with the score, number of features, epochs and rmse.

    Raises
    ------
    ValueError
        If `steps_to_see` or `epochs` is not positive, or if the target frames do
        not have exactly one column.

    Notes
    -----
    Side effects: sets the seaborn figure size globally, shows one figure per
    checkpoint and prints the metrics. Relies on the notebook-level imports of
    ``tf``, ``sns``, ``plt`` and ``r2_score``.
    """
    steps_to_see, epochs = int(steps_to_see), int(epochs)
    if steps_to_see <= 0 or epochs <= 0:
        # Without this guard no round would run and the metrics would be undefined.
        raise ValueError("steps_to_see and epochs must be positive integers")

    # tf works with arrays, so convert the frames first
    x_train_np = x_train.to_numpy()
    y_train_np = y_train.to_numpy()
    x_test_np = x_test.to_numpy()
    y_test_np = y_test.to_numpy()

    if y_train_np.ndim != 2 or y_train_np.shape[1] != 1 or y_test_np.shape != (len(y_test), 1):
        # The comparison frame below has exactly one predicted and one actual column.
        raise ValueError("y_train and y_test must be single-column DataFrames")

    # a single Dense layer with one output per target column
    output_size = y_train_np.shape[1]
    model = tf.keras.Sequential([tf.keras.layers.Dense(output_size)])

    # nadam optimizer with mean squared error as the loss
    model.compile(optimizer="nadam", loss="mean_squared_error")

    # you can change the size of the graphs that are shown below
    sns.set(rc={'figure.figsize': (8, 8)})

    trained_epochs = 0
    for round_epochs in _training_rounds(steps_to_see, epochs):
        # continue fitting the same model for one more round
        model.fit(x_train_np, y_train_np, epochs=round_epochs, verbose=0)
        trained_epochs += round_epochs

        # predict the test set with the current weights
        np_predicted_tf = np.asarray(model.predict_on_batch(x_test_np))

        # put predictions and actual values side by side, labelled by the test index
        tf_data_merged = pd.DataFrame({"Predicted_Y": np_predicted_tf.ravel(),
                                       "Test_Y": y_test_np.ravel()})
        tf_data_merged.insert(0, "Date", y_test.index)

        # plotting the predictions over y_test data
        sns.lineplot(data=tf_data_merged, x="Date", y="Predicted_Y", label="Predicted AQI")
        sns.lineplot(data=tf_data_merged, x="Date", y="Test_Y", label="AQI")
        plt.xticks(rotation=90)
        plt.legend()
        plt.show()

        # score calculation
        R2 = r2_score(y_test_np, np_predicted_tf, multioutput='variance_weighted')
        print ("Score = " + str(R2))

        print("--------------------------------")
        print("Number of Epochs = " + str(trained_epochs))
        print("--------------------------------")
        print ("Root of Mean Squared Error:")
        rmse_val = _rmse(np_predicted_tf, y_test_np)
        print (rmse_val)

    explained_tf_df = pd.DataFrame({"Score": R2, "Features used": x_train.shape[1],
                                    "Epochs": epochs, "rmse": rmse_val}, index=range(1))

    return tf_data_merged, R2, rmse_val, explained_tf_df



In [None]:
# Chronological hold-out split on `lasso_feature_x`:
# 2014-01 .. 2017-12 is used for training, 2018-01 .. 2019-10 for testing.
x = lasso_feature_x
x_train, y_train = x.loc["2014-01":"2017-12"], y.loc["2014-01":"2017-12"]
x_test, y_test = x.loc["2018-01":"2019-10"], y.loc["2018-01":"2019-10"]

# Quick size checks on the test target: non-null AQI count, then the number of
# elements in the array form (only the last expression is displayed by the cell).
y_test.AQI.count()
y_test_np = y_test.to_numpy()
y_test_np.size

In [None]:
# Train the network on the current split; results are kept under the *_lasso names.
#   steps_to_see=25000 -> a graph and the metrics are shown every 25000 epochs
#   epochs=100000      -> total number of epochs the model is trained for
# The total should be divisible by the checkpoint interval.
tf_lasso_df, score_tf_lasso, rmse_val_tf_lasso, explained_tf_df1 = tf_nn(
    x_train, y_train, x_test, y_test, steps_to_see=25000, epochs=100000
)

In [None]:
# Chronological hold-out split on `x_fstat`:
# 2014-01 .. 2017-12 is used for training, 2018-01 .. 2019-10 for testing.
x = x_fstat
x_train, y_train = x.loc["2014-01":"2017-12"], y.loc["2014-01":"2017-12"]
x_test, y_test = x.loc["2018-01":"2019-10"], y.loc["2018-01":"2019-10"]

In [None]:
# Train the network on the current split; results are kept under the *_f_feat names.
#   steps_to_see=25000 -> a graph and the metrics are shown every 25000 epochs
#   epochs=100000      -> total number of epochs the model is trained for
# The total should be divisible by the checkpoint interval.
tf_f_feat_df, score_tf_f_feat, rmse_val_tf_f_feat, explained_tf_df2 = tf_nn(
    x_train, y_train, x_test, y_test, steps_to_see=25000, epochs=100000
)

In [None]:
# One-row frame holding the neural-network score for each feature-selection method.
df_tf_scores = pd.DataFrame({"Lasso_Feature_Selection": score_tf_lasso,
                             "F_Feature_Selection": score_tf_f_feat},
                            index=["TensorFlow-NN-Reg"])

# The table is extended in place (same name on both sides), so a row left over from
# an earlier run of this cell is dropped first; otherwise every re-run would stack
# another duplicate "TensorFlow-NN-Reg" row. On the first run nothing is dropped.
two_feature_sel_final = pd.concat([
    two_feature_sel_final.drop(index=df_tf_scores.index, errors="ignore"),
    df_tf_scores,
])

In [None]:
# Display the score comparison table (bare last expression -> rich notebook output).
two_feature_sel_final

### An important Note!
#### The F_Feature_selection has been run 500 times, which makes a total of 500*500 = 250.000 which is a lot
#### For the first 3 run which makes 8*500 = 4000 steps, Lasso feature selection method works better for determining the variables.
#### Lasso_feature_selection had 9 different features
#### F_Selected_feature had 7 different features

# Below we will run the NN model with full features, without eliminating any

In [None]:
# Reload the cleaned daily data set and parse the date column.
df = pd.read_csv("All Values Cleaned Shanghai1.csv")
df.date = pd.to_datetime(df['date'], format="%Y-%m-%d")

df_sorted = df.sort_values(by=["date"])

# Target: mean AQI per calendar month, indexed by 'YYYY-MM' strings.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))['AQI'].mean()
df_month = pd.DataFrame(monthly)
y = df_month.sort_values(by=["date"])

# Features: monthly mean of every weather column, with no feature selection.
# The columns are selected with a *list*: picking several columns from a groupby
# with a bare tuple of names is deprecated and is rejected by recent pandas releases.
monthly = df.groupby(df['date'].dt.strftime('%Y-%m'))[[
    'mintempC', 'uvIndex', 'DewPointC', 'FeelsLikeC', 'HeatIndexC',
    'WindChillC', 'WindGustKmph', 'cloudcover', 'humidity', 'precipMM',
    'pressure', 'tempC', 'winddirDegree', 'windspeedKmph',
]].mean()
df_month = pd.DataFrame(monthly)
x = df_month.sort_values(by=["date"])
x

In [None]:
# Chronological hold-out split of the current x / y:
# 2014-01 .. 2017-12 is used for training, 2018-01 .. 2019-10 for testing.
x_train, y_train = x.loc["2014-01":"2017-12"], y.loc["2014-01":"2017-12"]
x_test, y_test = x.loc["2018-01":"2019-10"], y.loc["2018-01":"2019-10"]

In [None]:
# Train the network on the current split.
#   steps_to_see=25000 -> a graph and the metrics are shown every 25000 epochs
#   epochs=75000       -> total number of epochs the model is trained for
# The total should be divisible by the checkpoint interval.
tf_all_df, score_tf, rmse_val_tfv, explained_tf_df3 = tf_nn(
    x_train, y_train, x_test, y_test, steps_to_see=25000, epochs=75000
)

In [None]:
# The same score is entered under both feature-selection columns so that this row
# fits the layout of the comparison table it is appended to.
df_tf_scores_all = pd.DataFrame(
    data={
        "Lasso_Feature_Selection": score_tf,
        "F_Feature_Selection": score_tf,
    },
    index=["TensorFlw-All-Features"],
)
two_feature_sel_final_all = pd.concat([two_feature_sel_final, df_tf_scores_all])

In [None]:
# Display the comparison table including the all-features row.
two_feature_sel_final_all

### The last row, TensorFlw-All-Features, means we did not do any feature selection but fed all the features to the model
### Since the NN algorithm can drive the weights of unhelpful features close to zero, we don't have to do the feature selection ourselves.
#### After 1000 epochs we have similar results as the other machine learning algorithms. 

In [None]:
# Stack the per-run summaries in the same order as the row labels below:
#   explained_tf_df1 -> Lasso-selected features run
#   explained_tf_df2 -> F-statistic-selected features run
#   explained_tf_df3 -> all-features run
# (Previously df2 and df1 were concatenated in swapped order, so the first two
# rows carried each other's label.)
explained_tf_df = pd.concat([explained_tf_df1, explained_tf_df2, explained_tf_df3])
explained_tf_df.index = ["Lasso_Feature_selection", "F_Feature_Selection", "All_Features"]

In [None]:
# Display the per-run summary table (score, features used, epochs, rmse).
explained_tf_df

# Finally, let's check whether we were right not to include the COVID time period in our model

In [None]:
# Shift the split so that the test window (2019-10 .. 2020-12) covers the COVID period;
# everything from 2014-01 up to 2019-09 is used for training.
x_train = x.loc["2014-01":"2019-09"]
y_train = y.loc["2014-01":"2019-09"]

x_test = x.loc["2019-10":"2020-12"]
y_test = y.loc["2019-10":"2020-12"]

# Checkpoint every 25000 epochs, 100000 epochs in total.
tf_all_df_cov, score_tf_cov, rmse_val_tfv_cov, explained_tf_df3_cov = tf_nn (x_train,y_train,x_test,y_test,25000,100000)

# Stack the per-run summaries in the same order as the row labels below:
# explained_tf_df1 is the Lasso run and explained_tf_df2 the F-statistic run
# (previously these two were concatenated in swapped order and so mislabelled).
explained_tf_df = pd.concat([explained_tf_df1, explained_tf_df2, explained_tf_df3, explained_tf_df3_cov])
explained_tf_df.index = ["Lasso_Feature_selection", "F_Feature_Selection", "All_Features", "All feats with Covid timeline"]

In [None]:
# Display the per-run summary table, now including the COVID-period run.
explained_tf_df

Yes, we did the right thing: COVID clearly changed the air quality :)