In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [None]:
# Read the dataset, skipping the first two physical lines of the CSV file.
df = pd.read_csv(
    '../input/data-preprocess/dp.csv',
    skiprows=[0, 1],
)

In [None]:
# Display the frame exactly as loaded, before any column is dropped.
df

In [None]:
# Remove the 'CountryCode', '1' and ' CountryName' columns.
# NOTE(review): ' CountryName' carries a leading space — presumably it matches
# the CSV header verbatim; confirm against the file.
columns_to_drop = ['CountryCode', '1', ' CountryName']
df = df.drop(columns=columns_to_drop)

In [None]:
# Column labels that remain after the drop.
df.columns

In [None]:
# Display the frame after the column drop.
df

In [None]:
# (rows, columns) of the frame after the column drop.
df.shape

# dealing with missing values

In [None]:
# Number of missing values per column (`isna` is the same method as `isnull`).
df.isna().sum()

In [None]:
# Backward fill: each missing value takes the next valid value below it in the
# same column. `DataFrame.bfill()` replaces `fillna(method='bfill')`, which is
# deprecated in recent pandas releases.
df_bfill = df.bfill()

In [None]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. `DataFrame.ffill()` replaces `fillna(method='ffill')`, which is
# deprecated in recent pandas releases.
df_ffill = df.ffill()

In [None]:
# Independent snapshot of `df` taken before the imputers overwrite its first
# three columns in place (deep copy is the default, spelled out here).
df2 = df.copy(deep=True)

In [None]:
from sklearn.impute import SimpleImputer

# SimpleImputer with default settings (mean strategy per scikit-learn docs):
# fill the first three columns of `df` in place.
imputer = SimpleImputer()
leading_columns = df.iloc[:, 0:3]
df.iloc[:, 0:3] = imputer.fit_transform(leading_columns)

In [None]:
from sklearn.impute import KNNImputer

# Fit on the untouched snapshot `df2`, not on `df`: by this point the first
# three columns of `df` have already been filled by the SimpleImputer cell, so
# running KNN on `df` would find no missing values and change nothing.
# The KNN result (2 nearest neighbours) overwrites those columns in `df`.
knn_imputer = KNNImputer(n_neighbors=2)
df.iloc[:, 0:3] = knn_imputer.fit_transform(df2.iloc[:, 0:3])

In [None]:
# Re-count missing values per column after imputation.
df.isna().sum()

# dealing with categorical variables

In [None]:
# One-hot encode 'International Visitors', dropping the first level to avoid a
# redundant column. `dtype=int` forces 0/1 integer dummies: pandas 2.x emits
# bool columns by default, and a float/bool mix converts to an object array
# when the frame is later handed to numpy-based code (z-scores, OLS).
df = pd.get_dummies(
    df,
    columns=['International Visitors'],
    drop_first=True,
    dtype=int,
)

In [None]:
# Display the frame after one-hot encoding.
df

In [None]:
# Summary statistics of the encoded frame.
df.describe()

# feature scaling

In [None]:
# Two independent copies of `df`, one per scaling method, so `df` itself
# stays unscaled.
df_minmax, df_standard = df.copy(), df.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the first three columns of `df` and store them in `df_minmax`.
mmscaler = MinMaxScaler()
minmax_labels = df.iloc[:, 0:3].columns
minmax_values = mmscaler.fit_transform(df.iloc[:, 0:3])
df_minmax.iloc[:, 0:3] = pd.DataFrame(minmax_values, columns=minmax_labels)
df_minmax.head(2)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the first three columns of `df` and store them in `df_standard`.
sscaler = StandardScaler()
standard_labels = df.iloc[:, 0:3].columns
standard_values = sscaler.fit_transform(df.iloc[:, 0:3])
df_standard.iloc[:, 0:3] = pd.DataFrame(standard_values, columns=standard_labels)
df_standard.head(2)

# dealing with outliers

In [None]:
# One box plot per column, stacked vertically, to eyeball outliers.
boxplot_columns = [
    'Population growth',
    'Total population',
    'Area (sq. km)',
    'Coronavirus Cases',
]
fig, axes = plt.subplots(nrows=len(boxplot_columns), ncols=1)
for ax, column in zip(axes, boxplot_columns):
    # Pass the series as `x=` explicitly: a bare positional argument is read
    # as `x` by seaborn 0.11 but as `data` from 0.12 on, which changes how the
    # plot is drawn. The keyword form behaves the same in both.
    sns.boxplot(x=df[column], ax=ax)
# Keep the stacked axes' labels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
from scipy import stats

# Absolute z-score of every cell of `df`.
raw_z = stats.zscore(df)
z_score = np.abs(raw_z)

In [None]:
# Positions (row indices, column indices) of every cell whose |z| exceeds 3.
# Observation recorded by the author: (row 0, column 2) and (row 14, column 1)
# are the outliers.
np.where(z_score > 3)

In [None]:
# Keep only the rows in which every column has |z| below 3.
rows_within_3_sigma = (z_score < 3).all(axis=1)
df_noutlier_z = df[rows_within_3_sigma]

In [None]:
# (rows, columns) left after the z-score filter.
df_noutlier_z.shape

In [None]:
# Per-column quartiles and the 1.5 * IQR fences used to flag outliers.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1
cut_off = IQR * 1.5
lower = Q1 - cut_off
upper = Q3 + cut_off

In [None]:
# Boolean mask of cells outside the IQR fences (below the lower OR above the
# upper one). The element-wise `|` is required: the previous
# `print(<lower mask>) or (<upper mask>)` printed only the lower mask and then
# evaluated to the upper mask alone, because `print` returns None.
(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))

In [None]:
# Keep only the rows in which every column lies inside [lower, upper].
inside_fences = df.ge(lower) & df.le(upper)
df_noutlier_iqr = df[inside_fences.all(axis=1)]
df_noutlier_iqr.shape

# building a linear model

In [None]:
# Target is 'Coronavirus Cases'; every other column of the standardised frame
# is a predictor.
y = df_standard['Coronavirus Cases']
x = df_standard.drop(columns=['Coronavirus Cases'])

In [None]:
# Linear regression on the standard-scaled frame (`x` and `y` are taken from
# `df_standard`), with outliers kept, after imputation and one-hot encoding.
# The earlier comment claimed "without feature scaling", which contradicted
# where `x` and `y` come from.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x, y)

In [None]:
from sklearn import metrics

# In-sample evaluation: predictions are made on the same `x` the model was
# fitted on.
y_pred = model.predict(x)
for fitted_attribute in (model.coef_, model.intercept_):
    print(fitted_attribute)
plt.scatter(y_pred, y)
mse = metrics.mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
print('MSE is :', mse)
print('RMSE is :', rmse)

In [None]:
# Side-by-side table of observed targets and the model's predictions.
real_vs_prediction = {'real': y, 'prediction': y_pred}
ys = pd.DataFrame(real_vs_prediction)

# statsmodels

In [None]:
# Alias kept as `smf` so any other cell that refers to it keeps working.
import statsmodels.api as smf

# statsmodels' OLS does not include an intercept unless the design matrix
# carries a constant column, so add one; otherwise the fit is forced through
# the origin and is not comparable with the sklearn LinearRegression fitted
# just before, which estimates an intercept.
# The float cast guards against bool dummy columns (pandas 2.x default), which
# would turn the design matrix into an object array.
x_design = smf.add_constant(x.astype(float))
model_s = smf.OLS(y, x_design).fit()

In [None]:
# Estimated coefficients of the OLS fit.
model_s.params

In [None]:
# p-value of each estimated coefficient.
model_s.pvalues

In [None]:
# Full regression report for the OLS fit.
model_s.summary()

# polynomial regression

In [None]:
# Rebind `x` and `y` for the single-predictor polynomial fit; both are taken
# from the unscaled frame `df`.
x, y = df['Total population'], df['Coronavirus Cases']

In [None]:
# Scatter of the target against the single predictor.
plt.scatter(x=x, y=y)

In [None]:
# Turn both series into (n, 1) column arrays.
# `Series[:, np.newaxis]` (multi-dimensional indexing on a Series) raises in
# pandas 2.x, so convert to ndarray first. `reshape(-1, 1)` also makes this
# cell safe to re-run: an input that is already (n, 1) is left unchanged
# instead of gaining another axis.
x = np.asarray(x).reshape(-1, 1)
y = np.asarray(y).reshape(-1, 1)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial expansion of the single predictor column.
polynomial_features = PolynomialFeatures(degree=2)
x_as_column = x.reshape(-1, 1)
x_poly = polynomial_features.fit_transform(x_as_column)

In [None]:
# Inspect the expanded polynomial feature matrix.
x_poly

In [None]:
# Fit a linear model on the polynomial features and score it on the same data
# it was trained on. Note that `rmse` is rebound here.
model_p = LinearRegression().fit(x_poly, y)
y_poly_pred = model_p.predict(x_poly)

poly_mse = metrics.mean_squared_error(y, y_poly_pred)
rmse = np.sqrt(poly_mse)
r2 = metrics.r2_score(y, y_poly_pred)
for score in (rmse, r2):
    print(score)

In [None]:
# Coefficients fitted for the columns of `x_poly`.
model_p.coef_