## Importing the libraries

In [None]:
%pip install pycaret

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
from pycaret.regression import *
from pycaret.datasets import get_data


## Load and Prepare Data

In [None]:
# Load the country-level vaccination table. The absolute path follows the
# Kaggle kernel input layout, so this cell expects that dataset to be attached.
cov19=pd.read_csv('/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv')

# Display the loaded frame.
cov19

## EDA

In [None]:
cov19.describe()

In [None]:
cov19.info()

## Data Preprocessing

In [None]:
cov19.isnull().sum()

In [None]:
# NOTE: this binds a second name to the SAME DataFrame (no copy is made), so the
# in-place fill applied to `cov19_fillna` in the next cell also changes `cov19`.
# Later cells build their features from `cov19` and rely on that side effect.
cov19_fillna = cov19

In [None]:
# Impute missing values in the numeric columns with each column's mean.
# numeric_only=True is required because the frame still contains text columns
# (country, date, vaccines, ...) at this point: without it, pandas >= 2.0 raises
# a TypeError and pandas 1.x emits a FutureWarning while silently dropping them.
# The fill stays in place on purpose: `cov19_fillna` and `cov19` are the same
# object and the later cells read the filled values through `cov19`.
cov19_fillna.fillna(cov19_fillna.mean(numeric_only=True), inplace=True)
# count the number of NaN values in each column
print(cov19_fillna.isnull().sum())

cov19_fillna

In [None]:
# Replace the country names with integer codes (one code per distinct value).
# NOTE(review): `le` is reassigned by each of the following encoding cells, so
# this fitted encoder (and its code -> name mapping) is not kept. The integer
# codes also impose an arbitrary ordering when used as a numeric feature.
le=LabelEncoder()
cov19['country']=le.fit_transform(cov19['country'])
cov19

In [None]:
# Replace the ISO country codes with integer codes, overwriting the column.
# A fresh encoder is bound to `le`, replacing the previously fitted one.
le=LabelEncoder()
cov19['iso_code']=le.fit_transform(cov19['iso_code'])
cov19

In [None]:
# Replace each distinct value of the `vaccines` column with an integer code,
# overwriting the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
cov19['vaccines']=le.fit_transform(cov19['vaccines'])
cov19

In [None]:

# Replace each distinct `source_name` value with an integer code, overwriting
# the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
cov19['source_name']=le.fit_transform(cov19['source_name'])
cov19

In [None]:
# Replace each distinct `source_website` value with an integer code, overwriting
# the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
cov19['source_website']=le.fit_transform(cov19['source_website'])
cov19

In [None]:
# Swap the '-' separators inside the date strings for spaces.
# NOTE(review): the column remains text (it is not parsed to a datetime) and it
# is not among the model features selected later; confirm this is intended.
cov19['date'] = cov19['date'].str.replace('-', ' ')
cov19

In [None]:
cov19

In [None]:
cov19.columns

## Storytelling - Visualization

In [None]:
# Pairwise Pearson correlation of the numeric columns only.
# `date` is still a string column at this point, so calling corr() on the whole
# frame raises on pandas >= 2.0 (and warns on 1.x); select_dtypes works on
# every pandas version.
corr = cov19.select_dtypes(include='number').corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap='viridis', annot=True)
plt.title('Correlation between numeric columns')
plt.show()

In [None]:
# Grid of pairwise plots over the columns of `cov19`.
# NOTE(review): this runs on the full frame with one panel per column pair and
# may be slow to render; consider plotting a row sample or a column subset.
sns.pairplot(cov19)

## Prepare Data for Machine learning

In [None]:
sns.regplot( y="daily_vaccinations",x="total_vaccinations",  data=cov19)

In [None]:
sns.regplot( y="daily_vaccinations_raw",x="total_vaccinations",  data=cov19)

## Train your model

In [None]:
sns.scatterplot( y="people_fully_vaccinated",x="total_vaccinations",  data=cov19)

In [None]:

sns.scatterplot( y="people_vaccinated",x="total_vaccinations",  data=cov19)

In [None]:
sns.displot(cov19, x="country", hue="vaccines",  common_norm=False)

In [None]:
# Encoded vaccine scheme against encoded country.
# `data` is passed by keyword, matching the other plotting cells: seaborn 0.11
# declares scatterplot's parameters keyword-only and rejects a positional frame.
sns.scatterplot(data=cov19, x='vaccines', y="country")

In [None]:
sns.displot(cov19, x="country",kde=True)

In [None]:
sns.displot(cov19, x="source_name", kde=True)

In [None]:
sns.regplot( y="country",x="iso_code",  data=cov19)

## Test the model and show the metrics

In [None]:
# Predictor columns used by every model below (the target, total_vaccinations,
# and the text `date` column are deliberately left out of this list).
feature_columns = [
    'country',
    'iso_code',
    'people_vaccinated',
    'people_fully_vaccinated',
    'daily_vaccinations_raw',
    'daily_vaccinations',
    'total_vaccinations_per_hundred',
    'people_vaccinated_per_hundred',
    'people_fully_vaccinated_per_hundred',
    'daily_vaccinations_per_million',
    'vaccines',
    'source_name',
    'source_website',
]
x = cov19[feature_columns]
x

In [None]:
# Regression target: total_vaccinations.
# NOTE: the double brackets keep `y` as a one-column DataFrame (2-D) rather than
# a Series. scikit-learn's LinearRegression fitted on such a target exposes
# coef_ with shape (1, n_features) and returns 2-D predictions.
y=cov19[['total_vaccinations']]
y

In [None]:
# Shuffle and split into train/test with a fixed seed. No test_size is given,
# so scikit-learn's default applies (25% of the rows go to the test split).
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42)

In [None]:
x_train

In [None]:
x_test

In [None]:
y_train

In [None]:
y_test

- **supervised learning algorithms**

- [x] 1-Linear Regression

In [None]:
# Ordinary least-squares fit on the training split; the fitted estimator
# returned by fit() is this cell's displayed value.
LR=LinearRegression()
LR.fit(x_train, y_train)

In [None]:
LR.intercept_

In [None]:
LR.coef_

In [None]:
# One row per feature with its fitted linear-regression coefficient.
# LR was fitted on a one-column DataFrame target, so LR.coef_ has shape
# (1, n_features). Stacking it next to the column index as-is puts a whole
# array in the first row and NaN in the rest, so it is flattened here to give
# exactly one coefficient per attribute.
coeffcients = pd.DataFrame({
    'Attribute': x_train.columns,
    'Coefficients': np.ravel(LR.coef_),
})
coeffcients

In [None]:
y_test

In [None]:
# Predict total_vaccinations for the held-out test rows with the linear model.
y_pred_LR=LR.predict(x_test)
y_pred_LR



𝑅^2 : It is a measure of the linear relationship between X and Y. It is interpreted as the proportion of the variance in the dependent variable that is predictable from the independent variable.

Adjusted 𝑅^2 :The adjusted R-squared compares the explanatory power of regression models that contain different numbers of predictors.

MAE : It is the mean of the absolute value of the errors. It measures the difference between two continuous variables, here actual and predicted values of y.

MSE: The mean square error (MSE) is just like the MAE, but squares the difference before summing them all instead of using the absolute value.

RMSE: The root mean square error (RMSE) is the square root of the MSE. It is expressed in the same units as the target variable, which makes it easier to interpret than the MSE.


In [None]:
# Model Evaluation
# Compute each test-set metric once, then print them with their labels.
r2_lr = metrics.r2_score(y_test, y_pred_LR)
mae_lr = metrics.mean_absolute_error(y_test, y_pred_LR)
mse_lr = metrics.mean_squared_error(y_test, y_pred_LR)
for label, value in (
    ('R^2:', r2_lr),
    ('MAE:', mae_lr),
    ('MSE:', mse_lr),
    ('RMSE:', np.sqrt(mse_lr)),  # RMSE is the square root of the MSE above
):
    print(label, value)

In [None]:
# Actual vs. predicted totals for the linear model, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_LR)
ax.set_xlabel("total vaccines")
ax.set_ylabel("Predicted total vaccines")
ax.set_title("TOTAL VACCINES vs Predicted TOTAL VACCINES with LR")
plt.show()

In [None]:
# Distribution of the linear model's test residuals (actual minus predicted).
residuals_lr = y_test - y_pred_LR
sns.displot(residuals_lr)
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Histogram of Residuals")
plt.show()

- [x]  2-Polynomial Regression

In [None]:
# Degree-2 polynomial regression: expand the features into all terms up to
# degree 2, then fit an ordinary linear model on the expanded matrix.
poly_features = PolynomialFeatures(degree=2)

# Learn the expansion on the training split and apply the same one to the test split.
x_train_quadratic = poly_features.fit_transform(x_train)
x_test_quadratic = poly_features.transform(x_test)

# Linear regression on the expanded training features.
quadratic = LinearRegression().fit(x_train_quadratic, y_train)

# Predictions on the training split (to check the fit) and on the test split.
y_train_predicted = quadratic.predict(x_train_quadratic)
y_test_predicted = quadratic.predict(x_test_quadratic)

In [None]:
metrics.r2_score(y_train, y_train_predicted)

In [None]:
metrics.r2_score(y_test, y_test_predicted)

In [None]:
# Test-set error metrics for the polynomial model; the MSE is computed once
# and reused for the RMSE.
mae_poly = metrics.mean_absolute_error(y_test, y_test_predicted)
mse_poly = metrics.mean_squared_error(y_test, y_test_predicted)
print('MAE:', mae_poly)
print('MSE:', mse_poly)
print('RMSE:', np.sqrt(mse_poly))

In [None]:
# Actual vs. predicted totals for the polynomial model, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_predicted)
ax.set_xlabel("total vaccines")
ax.set_ylabel("Predicted total vaccines")
ax.set_title("TOTAL VACCINES vs Predicted TOTAL VACCINES in PN")
plt.show()

- [x] 3-DecisionTree

In [None]:
# Decision-tree regressor with a fixed seed, fitted on the training split.
# No depth or leaf-size limits are passed, so the estimator's defaults apply.
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(x_train, y_train)

In [None]:
# Predict total_vaccinations for the test rows with the decision tree.
y_pred_DT=regressor.predict(x_test)
y_pred_DT

In [None]:
y_test

In [None]:
# Test-set metrics for the decision tree; the MSE is computed once and reused
# for the RMSE.
r2_dt = metrics.r2_score(y_test, y_pred_DT)
mae_dt = metrics.mean_absolute_error(y_test, y_pred_DT)
mse_dt = metrics.mean_squared_error(y_test, y_pred_DT)
rmse_dt = np.sqrt(mse_dt)
print('R^2:', r2_dt)
print('MAE:', mae_dt)
print('MSE:', mse_dt)
print('RMSE:', rmse_dt)

In [None]:
# Actual vs. predicted totals for the decision tree, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_DT)
ax.set_xlabel("total vaccines")
ax.set_ylabel("Predicted total vaccines")
ax.set_title("TOTAL VACCINES vs Predicted TOTAL VACCINES in DT")
plt.show()

- [x] 4-XGBoost

In [None]:
# Gradient-boosted tree regressor, fitted on the training split.
#   n_estimators     - number of boosting rounds (trees)
#   learning_rate    - shrinkage applied to each tree's contribution
#   subsample        - fraction of training rows sampled for each tree
#   colsample_bytree - fraction of columns sampled for each tree (1 = all)
#   max_depth        - maximum depth of each tree
#   gamma            - minimum loss reduction required to make a split
reg_mod = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.08,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
    gamma=0,
)
reg_mod.fit(x_train, y_train)

In [None]:
# 10-fold cross-validation of the XGBoost configuration on the training split.
# No `scoring` argument is passed, so the estimator's own score() is used
# (R^2 for scikit-learn style regressors).
scores = cross_val_score(reg_mod, x_train, y_train,cv=10)
print("Mean cross-validation score: %.2f" % scores.mean())

In [None]:
# Fit on the full training split and predict the held-out test rows.
# NOTE(review): `reg_mod` was already fitted on the same data in the cell that
# constructs it; this second fit looks redundant. Confirm before removing it.
reg_mod.fit(x_train,y_train)

predictions = reg_mod.predict(x_test)

In [None]:
# Root mean squared error of the XGBoost predictions on the test split.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE: %f" % (rmse))

In [None]:
# Coefficient of determination (R^2) of the XGBoost predictions on the test split.
# r2_score already returns R^2, so it is reported as-is: taking its square root
# overstated the score (sqrt(v) > v for 0 < v < 1) and produced NaN whenever
# R^2 was negative.
r2 = r2_score(y_test, predictions)
print("R_Squared Score : %f" % (r2))

- **unsupervised learning algorithms**:

- [x]  k_means

In [None]:
# Elbow method: fit K-Means for k = 1..10 on the feature matrix and record the
# within-cluster sum of squares (inertia) for each k.
# NOTE(review): `x` is used unscaled, so columns with large magnitudes dominate
# the distance computation; confirm whether standardising first is intended.
wcss = []
for i in range(1, 11):
    # k-means++ is an algorithm for choosing the initial values (or "seeds") for the k-means clustering algorithm.
    # n_init is pinned to the long-standing default of 10 restarts: newer
    # scikit-learn releases changed the implicit default (and warn about it),
    # which would otherwise make the inertia curve depend on the installed version.
    kmeans = KMeans(n_clusters = i, init = 'k-means++', n_init = 10, random_state = 42)
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

In [None]:
wcss

In [None]:
# Inertia against the number of clusters; the bend ("elbow") suggests a suitable k.
cluster_counts = range(1, 11)
fig, ax = plt.subplots()
ax.plot(cluster_counts, wcss)
ax.set_title('The Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Final clustering with k = 2; `y_kmeans` holds the cluster label of every row of `x`.
# n_init is pinned to 10 restarts so the result does not depend on which
# scikit-learn release is installed (the implicit default changed across versions).
kmeans = KMeans(n_clusters = 2, init = 'k-means++', n_init = 10, random_state = 42)
y_kmeans = kmeans.fit_predict(x)

In [None]:
kmeans.cluster_centers_

## Save your final model

- [x] **Regression analysis with PyCaret**:

PyCaret's Regression module (pycaret.regression) is a supervised machine learning module which is used for predicting continuous values / outcomes using various techniques and algorithms. Regression can be used for predicting values / outcomes such as sales, units sold, temperature or any number which is continuous.

PyCaret's regression module has over 25 algorithms and 10 plots to analyze the performance of models. Be it hyper-parameter tuning, ensembling or advanced techniques like stacking, PyCaret's regression module has it all.

In [None]:
# Re-read the raw CSV into a separate frame for the PyCaret workflow; none of
# the preprocessing applied to `cov19` above carries over to `dataset`.
dataset =pd.read_csv('/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv')

In [None]:
# Split the raw rows into a 90% modelling sample and a 10% unseen hold-out.
# The sample must keep its ORIGINAL index labels until the hold-out is built:
# resetting the index first makes `data.index` 0..n-1, so drop() would remove
# the first 90% of the file instead of the sampled rows, and the "unseen" set
# would overlap the modelling data.
data = dataset.sample(frac=0.9, random_state=786)
data_unseen = dataset.drop(data.index).reset_index(drop=True)
data = data.reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))


In [None]:
# NOTE: this is a second name for the SAME DataFrame (no copy is made), so the
# in-place fill applied to `dataset_fillna` in the next cell also changes
# `dataset`, which is the frame the later cells encode and pass to PyCaret.
dataset_fillna = dataset

In [None]:
# Impute missing values in the numeric columns with each column's mean.
# numeric_only=True is required because the frame still contains text columns
# at this point: without it, pandas >= 2.0 raises a TypeError and pandas 1.x
# emits a FutureWarning while silently dropping them.
# The fill stays in place on purpose: `dataset_fillna` and `dataset` are the
# same object and the later cells read the filled values through `dataset`.
dataset_fillna.fillna(dataset_fillna.mean(numeric_only=True), inplace=True)
# count the number of NaN values in each column
print(dataset_fillna.isnull().sum())

dataset_fillna

In [None]:
# Replace the country names in `dataset` with integer codes.
# NOTE(review): `le` is reassigned by each of the following encoding cells, so
# the fitted code -> name mapping is not kept for later decoding.
le=LabelEncoder()
dataset['country']=le.fit_transform(dataset['country'])
dataset

In [None]:
# Replace the ISO country codes in `dataset` with integer codes, overwriting
# the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
dataset['iso_code']=le.fit_transform(dataset['iso_code'])
dataset

In [None]:
# Replace each distinct `vaccines` value in `dataset` with an integer code,
# overwriting the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
dataset['vaccines']=le.fit_transform(dataset['vaccines'])
dataset

In [None]:

# Replace each distinct `source_name` value in `dataset` with an integer code,
# overwriting the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
dataset['source_name']=le.fit_transform(dataset['source_name'])
dataset

In [None]:
# Replace each distinct `source_website` value in `dataset` with an integer
# code, overwriting the column. A fresh encoder is bound to `le`.
le=LabelEncoder()
dataset['source_website']=le.fit_transform(dataset['source_website'])
dataset

In [None]:
# Swap the '-' separators inside the date strings for spaces.
# NOTE(review): the column remains text (it is not parsed to a datetime) and is
# handed to PyCaret's setup() in that form; confirm this is intended.
dataset['date'] = dataset['date'].str.replace('-', ' ')
dataset

- a) Setting up Environment in PyCaret

The setup() function initializes the environment in pycaret and creates the transformation pipeline to prepare the data for modeling and deployment. setup() must be called before executing any other function in pycaret. It takes two mandatory parameters: a pandas dataframe and the name of the target column. All other parameters are optional and are used to customize the pre-processing pipeline (we will see them in later tutorials).

When setup() is executed, PyCaret's inference algorithm automatically infers the data types of all features based on certain properties. The data types should be inferred correctly, but this is not always the case. To account for this, PyCaret displays a table containing the features and their inferred data types after setup() is executed. If all of the data types are correctly identified, press `Enter` to continue; otherwise type `quit` to end the experiment. Ensuring that the data types are correct is of fundamental importance in PyCaret, as it automatically performs a few pre-processing tasks which are imperative to any machine learning experiment. These tasks are performed differently for each data type, which means it is very important for them to be correctly configured.

In later tutorials we will learn how to overwrite PyCaret's inferred data types using the numeric_features and categorical_features parameters in setup().


In [None]:
# Initialise the PyCaret regression experiment with `total_vaccinations` as the
# target; session_id is passed so the experiment uses a fixed seed.
# NOTE(review): this passes the full `dataset` frame, not the 90% `data` sample
# created earlier, so the rows in `data_unseen` are also part of this
# experiment. Confirm whether that is intended.
exp_reg = setup(data = dataset, target = 'total_vaccinations',session_id=123)



-  b) Comparing All Models

Comparing all models to evaluate performance is the recommended starting point for modeling once the setup is completed (unless you know exactly what kind of model you need, which is often not the case). This function trains all models in the model library and scores them using k-fold cross-validation for metric evaluation. The output prints a score grid that shows the average MAE, MSE, RMSE, R2, RMSLE and MAPE across the folds (10 by default) of all the available models in the model library.

In [None]:
compare_models()

- Create model

In [None]:
et = create_model('et')

In [None]:
xgboost=create_model('xgboost')

- **Tune a Model**:

When a model is created using the create_model function, it uses the default hyperparameters to train the model. In order to tune hyperparameters, the tune_model function is used. This function automatically tunes the hyperparameters of a model using a random grid search on a pre-defined search space. The output prints a scoring grid that shows MAE, MSE, RMSE, R2, RMSLE, and MAPE by fold. To use a custom search grid, you can pass the custom_grid parameter to the tune_model function. Note that tune_model is not called in this notebook; the plots below are for the untuned `xgboost` model created above.

- Residual Plot

In [None]:
plot_model(xgboost)

- Prediction Error Plot

In [None]:
plot_model(xgboost, plot = 'error')

- Feature Importance Plot

In [None]:
plot_model(xgboost, plot='feature')

In [None]:
evaluate_model(xgboost)

In [None]:
# Persist the `xgboost` model created above under the given name.
# NOTE(review): the name contains spaces, and PyCaret is documented to append
# the `.pkl` extension itself; confirm the resulting file name is acceptable
# wherever the model will be loaded.
save_model(xgboost,'cov19 Model 23Aug2023')