# WORKING WITH MALARIA DATASET

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the reported malaria numbers from the CSV file at this relative path.
reported = pd.read_csv("../input/malaria-dataset/reported_numbers.csv")

In [None]:
# Working table used by the rest of the notebook, built from the loaded CSV data.
df = pd.DataFrame(data=reported)
# Show the first 15 rows as a quick sanity check.
df.head(n=15)

## DATA CLEANING

In [None]:
# Number of distinct values in the Country column (missing values counted too).
df["Country"].nunique(dropna=False)

In [None]:
# How many rows exist for each value of Year.
df["Year"].value_counts()

In [None]:
# (rows, columns) of the table before any rows are dropped.
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Remove every row that contains at least one missing value; df is modified in place.
df.dropna(inplace=True)

In [None]:
# Re-count missing values per column to confirm the clean-up.
df.isna().sum()

In [None]:
# (rows, columns) of the table after rows with missing values were dropped.
df.shape

## DATA VISUALISATION AND ANALYSIS

### ANALYSING AND VISUALISING DATA GROUPED COUNTRY WISE

In [None]:
# Total cases and deaths per country, summed over all rows of that country.
# The two columns are selected with a LIST: selecting several columns from a
# groupby with a bare tuple ("a", "b") is deprecated and raises on pandas >= 2.0.
df1 = df.groupby('Country')[["No. of cases", "No. of deaths"]].sum().reset_index()
df1.head()

In [None]:
# Keep only the country name and its summed case count.
df2 = df1.loc[:, ["Country", "No. of cases"]]
df2.head()

In [None]:
# Order countries from the highest to the lowest case total.
df2 = df2.sort_values(by="No. of cases", ascending=False)

In [None]:
#the following function allows me to print the data on the top of my bars
def data_graph(axis, width, height):
    """Hide the top/right spines of *axis* and label each bar with its height.

    Parameters:
        axis: axes-like object exposing ``spines``, ``patches`` and ``annotate``.
        width: horizontal offset added to each bar's x position for the label.
        height: vertical offset added to each bar's height for the label.

    Each label is the bar height formatted with one decimal place.
    """
    for side in ('top', 'right'):
        axis.spines[side].set_visible(False)

    for bar in axis.patches:
        bar_height = bar.get_height()
        label = "{0:.1f}".format(bar_height)
        anchor = (bar.get_x() + width, bar_height + height)
        axis.annotate(label, anchor)

In [None]:
# Bar chart of the first 20 rows of df2 (country vs. summed cases).
plt.figure(figsize=(25, 10))
ax = sns.barplot(data=df2.iloc[:20], x="Country", y="No. of cases", palette="Spectral")
ax.set_title("Top 20 Countries with Most Number of Cases")
ax.set_xlabel("Country")
ax.set_ylabel("No. of Cases (1 unit = 10000000)")
# Tilt the country names so they stay readable.
plt.xticks(rotation=60, ha='right')
# Print each bar's value above it.
data_graph(ax, 0.1, 10)
plt.show()

In [None]:
# Keep only the country name and its summed death count.
df3 = df1.loc[:, ["Country", "No. of deaths"]]
df3.head()

In [None]:
# Order countries from the highest to the lowest death total.
df3 = df3.sort_values(by="No. of deaths", ascending=False)

In [None]:
# Bar chart of the first 20 rows of df3 (country vs. summed deaths).
plt.figure(figsize=(25, 10))
ax = sns.barplot(data=df3.iloc[:20], x="Country", y="No. of deaths", palette="cubehelix")
ax.set_title("Top 20 Countries with Most Number of deaths")
ax.set_xlabel("Country")
ax.set_ylabel("No. of deaths")
# Tilt the country names so they stay readable.
plt.xticks(rotation=60, ha='right')
# Print each bar's value above it.
data_graph(ax, 0.1, 10)
plt.show()

### ANALYSING AND VISUALISING DATA GROUPED WHO REGION WISE

In [None]:
# Total cases and deaths per WHO region.
# The two columns are selected with a LIST: selecting several columns from a
# groupby with a bare tuple ("a", "b") is deprecated and raises on pandas >= 2.0.
df4 = df.groupby("WHO Region")[["No. of cases", "No. of deaths"]].sum().reset_index()
df4.head()

In [None]:
# Keep only the region name and its summed case count.
df5 = df4.loc[:, ["WHO Region", "No. of cases"]]
df5.head()

In [None]:
# Bar chart of summed cases for every WHO region in df5.
plt.figure(figsize=(10, 5))
ax = sns.barplot(data=df5, x="WHO Region", y="No. of cases", palette="icefire")
ax.set_title("WHO Regions and amount of cases in each region")
ax.set_xlabel("WHO Regions")
ax.set_ylabel("No. of Cases (1 unit = 10000000)")
# Tilt the region names so they stay readable.
plt.xticks(rotation=60, ha='right')
# Print each bar's value above it.
data_graph(ax, 0.1, 10)
plt.show()

In [None]:
# Keep only the region name and its summed death count.
df6 = df4.loc[:, ["WHO Region", "No. of deaths"]]
df6.head()

In [None]:
# Bar chart of summed deaths for every WHO region in df6.
plt.figure(figsize=(10, 5))
ax = sns.barplot(data=df6, x="WHO Region", y="No. of deaths", palette="coolwarm")
ax.set_title("WHO Regions and amount of deaths in each region")
ax.set_xlabel("WHO Regions")
ax.set_ylabel("No. of deaths")
# Tilt the region names so they stay readable.
plt.xticks(rotation=60, ha='right')
# Print each bar's value above it.
data_graph(ax, 0.1, 10)
plt.show()

### ANALYSING AND VISUALISING DATA GROUPED YEAR WISE

In [None]:
# Total cases and deaths per year.
# The two columns are selected with a LIST: selecting several columns from a
# groupby with a bare tuple ("a", "b") is deprecated and raises on pandas >= 2.0.
df7 = df.groupby("Year")[["No. of cases", "No. of deaths"]].sum().reset_index()
df7.head()

In [None]:
# Year becomes the index so that both remaining columns can be plotted against it.
df8 = df7.set_index("Year")

In [None]:
# Line chart with one line per column of df8, plotted against its Year index.
plt.figure(figsize=(10, 5))
ax = sns.lineplot(data=df8)
ax.set_title("Year wise Data")
ax.set_xlabel("Year")
ax.set_ylabel("(1 unit = 1000000)")
plt.xticks(rotation=60, ha='right')
plt.show()

In [None]:
# Line chart of the summed case count for each year in df7.
plt.figure(figsize=(10, 5))
ax = sns.lineplot(data=df7, x="Year", y="No. of cases")
ax.set_title("Yearly Cases")
ax.set_xlabel("Year")
ax.set_ylabel("No. of Cases (1 unit = 1000000)")
plt.xticks(rotation=60, ha='right')
plt.show()

In [None]:
# Line chart of the summed death count for each year in df7.
plt.figure(figsize=(10, 5))
ax = sns.lineplot(data=df7, x="Year", y="No. of deaths")
ax.set_title("Yearly Deaths")
ax.set_xlabel("Year")
ax.set_ylabel("No. of deaths")
plt.xticks(rotation=60, ha='right')
plt.show()

## WORKING WITH MODELS

For our models we use Random Forest, KNN and Stochastic Gradient Descent. With the Random Forest algorithm we compute the R2 score; with KNN we build a confusion matrix and use it to understand the model's behaviour; and for Stochastic Gradient Descent we plot a ROC curve in order to assess the accuracy of our model.

### ENCODING DATASET TO MAKE IT FIT FOR FITTING INTO MODELS

In [None]:
# Indicator (dummy) columns built from the Country strings.
pred_df1 = df['Country'].str.get_dummies()
# Indicator (dummy) columns built from the WHO Region strings.
pred_df2 = df['WHO Region'].str.get_dummies()

In [None]:
# Put the original table and both indicator tables side by side (column-wise).
frames = [df, pred_df1, pred_df2]
pred_df = pd.concat(frames, axis="columns")
pred_df.head()

In [None]:
# The two text columns are now represented by indicator columns, so remove them in place.
cols = ["Country", "WHO Region"]
pred_df.drop(cols, axis=1, inplace=True)
pred_df.shape

### USING RANDOM FOREST AND FINDING R2 SCORE

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features: every column except the target. Target: the death counts.
x = pred_df.drop("No. of deaths", axis=1)
y = pred_df[["No. of deaths"]]

# Split BEFORE scaling so the held-out rows never influence the scaling statistics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

# Feature scaler: fitted on the training rows only, then applied to the test rows.
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

# The target gets its OWN scaler; re-fitting `std` on y would overwrite the
# feature statistics. The result is flattened to 1-D, the shape regressors expect.
y_std = StandardScaler()
y_train = y_std.fit_transform(y_train).ravel()
y_test = y_std.transform(y_test).ravel()

print("x_train :",x_train.shape)
print("x_test :",x_test.shape)
print("y_train :",y_train.shape)
print("y_test :",y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the forest, and therefore the reported score, is reproducible.
model = RandomForestRegressor(random_state=1)
# np.ravel flattens a column-vector target to 1-D (a no-op if it is already 1-D),
# which avoids scikit-learn's column-vector conversion warning.
model.fit(x_train, np.ravel(y_train))
preds = model.predict(x_test)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the forest's predictions on the test split.
r = r2_score(y_true=y_test, y_pred=preds)
print("R2score when we predict using Randomn forest is ", r)

### USING KNN AND CONFUSION MATRIX

In [None]:
from sklearn import preprocessing

# Features: every column except the target, with each ROW rescaled to unit norm
# (the default behaviour of preprocessing.normalize).
x = pred_df.drop("No. of deaths", axis=1)
x = preprocessing.normalize(x)

# Binary class label for the classifier: 1 when any death was reported, else 0.
# The one-column target must NOT go through preprocessing.normalize: that
# rescales each row to unit norm, which silently turns every non-zero count
# into 1 and hides the fact that the task is binary.
# Assumes death counts are never negative - TODO confirm against the data.
y = (pred_df["No. of deaths"] > 0).astype(int).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split. The seed matches the other splits in this notebook so
# the KNN score and confusion matrix are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 4-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=4)
# np.ravel flattens a column-vector target to 1-D (a no-op if it is already 1-D),
# which avoids scikit-learn's column-vector conversion warning.
knn.fit(x_train, np.ravel(y_train))

In [None]:
# Score of the fitted classifier on the held-out split.
knn.score(X=x_test, y=y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the classifier's test-set predictions with the true labels.
y_pred = knn.predict(x_test)
con_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
con_mat

In [None]:
import math


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


# Rows of the confusion matrix are true labels and columns are predicted labels,
# so a 2x2 matrix flattens to (tn, fp, fn, tp). Plain ints avoid fixed-width
# integer overflow in the products below.
tn, fp, fn, tp = (int(count) for count in con_mat.ravel())

sensitivity = _safe_ratio(tp, tp + fn)
specificity = _safe_ratio(tn, tn + fp)
precision = _safe_ratio(tp, fp + tp)
recall = _safe_ratio(tp, fn + tp)
accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
f1 = _safe_ratio(2 * (precision * recall), precision + recall)
fpr = _safe_ratio(fp, tn + fp)
fnr = _safe_ratio(fn, tp + fn)
npv = _safe_ratio(tn, tn + fn)
fdr = _safe_ratio(fp, fp + tp)
# Matthews correlation coefficient: the WHOLE difference tp*tn - fp*fn is divided
# by the square root. Without the parentheses only fp*fn would be divided.
mcc = _safe_ratio(
    (tp * tn) - (fp * fn),
    math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
)

print("Precision = {}, Accuracy = {}, F1 = {}, FPR = {}, FNR = {}, NPV = {}, FDR = {}, MCC = {}".format(precision,accuracy,f1,fpr,fnr,npv,fdr,mcc))
print()
print ("Sensitivity = {}, Specificity = {}".format(sensitivity, specificity))

### USING STOCHASTIC GRADIENT DESCENT AND ROC CURVE

In [None]:
from sklearn import preprocessing

# Features: every column except the target, with each ROW rescaled to unit norm
# (the default behaviour of preprocessing.normalize).
x = pred_df.drop("No. of deaths", axis=1)
x = preprocessing.normalize(x)

# Binary class label for the classifier: 1 when any death was reported, else 0.
# The one-column target must NOT go through preprocessing.normalize: that
# rescales each row to unit norm, which silently turns every non-zero count
# into 1 and hides the fact that the task is binary.
# Assumes death counts are never negative - TODO confirm against the data.
y = (pred_df["No. of deaths"] > 0).astype(int).to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Seeded 70/30 train/test split of the features and labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_curve, auc

# Linear classifier trained with hinge loss and balanced class weights.
# The seed is fixed so the fitted model and its ROC curve are reproducible.
model = SGDClassifier(loss='hinge', class_weight='balanced', random_state=1)
# np.ravel flattens a column-vector target to 1-D (a no-op if it is already 1-D),
# which avoids scikit-learn's column-vector conversion warning.
model.fit(x_train, np.ravel(y_train))

# Decision-function scores for both splits; roc_curve takes these as its scores.
y_train_pred = model.decision_function(x_train)
y_test_pred = model.decision_function(x_test)

In [None]:
# ROC points for the train and test splits, from the decision-function scores.
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred)

plt.grid()

# Both curves are drawn with the false positive rate on x and the true positive
# rate on y; the area under each curve goes in the legend.
plt.plot(train_fpr, train_tpr, label=" AUC TRAIN ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label=" AUC TEST ="+str(auc(test_fpr, test_tpr)))
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0,1],[0,1],'g--')
plt.legend()
# Axis labels follow the plotted data: x is FPR, y is TPR.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC(ROC curve)")
plt.show()
