# Fun with Pokemon Dataset

## Thanks for checking it out — feel free to leave a comment.
------

## Notebook Preparation

Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use the fully qualified option name: the short pattern 'max.columns' is matched
# as a regex and is ambiguous on pandas versions that also register
# 'styler.render.max_columns', which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# the same helpers live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import roc_curve, f1_score, accuracy_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline

## Data Importing

In [None]:
# Load the raw CSV into `df`. low_memory=False makes pandas parse the file in a
# single pass, so column dtypes are inferred from the whole column.
df = pd.read_csv('../input/Pokemon.csv', low_memory=False)

## Basic EDA

In [None]:
# Column names, dtypes and non-null counts — a quick check for missing values.
df.info()

In [None]:
# Preview the first five rows of the raw frame.
df.head()

In [None]:
## Share of rows flagged Legendary, printed as a percentage of the whole dataset
legendary_rows = df[df['Legendary'] == True]
legendary_pct = len(legendary_rows) / len(df) * 100
print('Legendary:', str(legendary_pct) + '%')

------

## Data Visualization

### Pokemon Type 1 count plot

In [None]:
# Count of Pokemon per primary type.
# The column is passed by keyword (x=..., data=...) because newer seaborn releases
# bind the first positional argument to `data`, not `x`.
# Title and tick rotation are applied to the returned Axes after drawing, so they
# act on the actual category ticks rather than on an empty placeholder axes.
ax = sns.countplot(x='Type 1', data=df)
ax.set_title('Count Plot')
ax.tick_params(axis='x', labelrotation=45)

# I expected the Fire type to have the highest count

### Pokemon Type 2 count plot

In [None]:
# Count of Pokemon per secondary type.
# The column is passed by keyword (x=..., data=...) because newer seaborn releases
# bind the first positional argument to `data`, not `x`.
# Title and tick rotation are applied to the returned Axes after drawing, so they
# act on the actual category ticks rather than on an empty placeholder axes.
ax = sns.countplot(x='Type 2', data=df)
ax.set_title('Count Plot')
ax.tick_params(axis='x', labelrotation=45)

### Distribution plot of Pokemon Total

In [None]:
# Distribution of the 'Total' column (density histogram with a KDE overlay).
# sns.distplot is deprecated since seaborn 0.11; histplot with kde=True and
# stat='density' is its closest replacement. Fall back to distplot only on
# seaborn versions that predate histplot.
if hasattr(sns, 'histplot'):
    sns.histplot(df['Total'], kde=True, stat='density')
else:
    sns.distplot(df['Total'])

In [None]:
## Breakdown of the generations: number of rows per 'Generation' value,
## sorted by count (most frequent first)
df['Generation'].value_counts()

## Pair-plot to understand linear relationships

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal) of the six stat columns.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
sns.pairplot(df[stat_columns])

-------

## Correlation Matrix

In [None]:
# Pairwise correlation of the numeric and boolean columns only.
# The frame also holds string columns (e.g. 'Name', 'Type 1'); since pandas 2.0
# DataFrame.corr() no longer drops those silently and raises instead, so the
# numeric/bool subset is selected explicitly (this also works on older pandas).
corr = df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr,
    annot=True,
    cmap='coolwarm',
)

-------

## More EDA

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Rows whose 'Name' repeats an earlier row; an empty result means every name is unique.
df[df['Name'].duplicated()] # no duplicates

In [None]:
# Contingency table: rows are primary types, columns are the Legendary flag.
pd.crosstab(index=df['Type 1'], columns=df['Legendary'])

In [None]:
# Print each column name with its number of distinct values
# (Series.unique() keeps NaN, so a missing value counts as one distinct value).
for column in df.columns:
    distinct_values = df[column].unique()
    print(column, len(distinct_values))

-------

## Data Transformation

In [None]:
# Encode the Legendary flag as an integer label: 1 where the value equals True, 0 otherwise.
df['Legendary'] = df['Legendary'].eq(True).astype('int64')

In [None]:
# Keep every column from position 2 onward, i.e. drop the first two columns.
# NOTE(review): presumably those are an id column and 'Name' — confirm the CSV
# column order, since this selection is positional.
dataset = df.iloc[:, 2:]

In [None]:
# Preview the modelling frame after dropping the leading columns.
dataset.head()

In [None]:
# One-hot encode the categorical columns (with an explicit NaN indicator, dropping
# the first level of each), then drop 'Legendary' and 'Total' and append the label
# as the last column under the name 'Target'.
encoded = pd.get_dummies(dataset, dummy_na=True, drop_first=True)
dataset = (
    encoded
    .drop(['Legendary', 'Total'], axis=1)
    .assign(Target=encoded['Legendary'])
)

------

## Machine Learning

In [None]:
# Split the frame into the label ('Target', the last column) and the feature
# matrix (every other column).
y = dataset['Target']
X = dataset.drop('Target', axis=1)

In [None]:
# Peek at the first two label values.
y.head(2)

In [None]:
# Peek at the first five feature rows.
X.head()

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state fixes the shuffle so every metric below is reproducible on re-run;
# stratify=y keeps the class ratio of the label the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# (rows, features) of the training features.
X_train.shape

In [None]:
# (rows, features) of the held-out features.
X_test.shape

In [None]:
# Number of training labels; should equal the row count of X_train.
y_train.shape

In [None]:
# Number of held-out labels; should equal the row count of X_test.
y_test.shape

In [None]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
# NOTE(review): the features are not scaled before fitting — confirm the solver
# converges without warnings.
clr = LogisticRegression()

In [None]:
# Fit the logistic regression on the training partition only.
clr.fit(X_train, y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = clr.predict(X_test)

In [None]:
# Fraction of held-out rows classified correctly.
# NOTE(review): if Legendary rows are rare, accuracy alone can look high for a
# model that rarely predicts the positive class — read it with the report below.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall, F1 and support; print() is needed because the
# report is returned as a pre-formatted string.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the logistic regression: rows are true labels, columns are predictions.
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Display the logistic-regression confusion matrix.
cm

In [None]:
# ROC curve and AUC for the logistic regression on the held-out rows.
probs = clr.predict_proba(X_test)
# Column 1 holds the probability of clr.classes_[1] (label 1 with the 0/1 target used here).
preds = probs[:,1]
# roc_curve sweeps the decision threshold and returns the matching FPR/TPR arrays.
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
# Area under the ROC curve, computed from the curve points.
roc_auc = metrics.auc(fpr, tpr)

In [None]:
## SVC: second classifier, evaluated on the same split as the logistic regression

In [None]:
# Support-vector classifier with probability estimates enabled (required for
# predict_proba and the ROC curve). The probability calibration uses an internal
# shuffled cross-validation, so random_state is fixed to make the probabilities,
# and therefore the ROC/AUC, reproducible across runs.
svc = SVC(probability=True, random_state=42)

In [None]:
# Fit the SVC on the same training partition as the logistic regression.
svc.fit(X_train, y_train)

In [None]:
# ROC curve and AUC for the SVC on the held-out rows (mirrors the LR computation).
svc_probs = svc.predict_proba(X_test)
# Column 1 holds the probability of svc.classes_[1] (label 1 with the 0/1 target used here).
svc_preds = svc_probs[:,1]
# roc_curve sweeps the decision threshold and returns the matching FPR/TPR arrays.
svc_fpr, svc_tpr, svc_threshold = metrics.roc_curve(y_test, svc_preds)
# Area under the ROC curve, computed from the curve points.
svc_roc_auc = metrics.auc(svc_fpr, svc_tpr)

In [None]:
# Hard class predictions from the SVC for the held-out rows.
svc_y_pred = svc.predict(X_test)

In [None]:
# Fraction of held-out rows the SVC classifies correctly (compare with the LR accuracy).
accuracy_score(y_test, svc_y_pred)

In [None]:
# True-positive rates along the logistic-regression ROC curve.
tpr

In [None]:
# True-positive rates along the SVC ROC curve.
svc_tpr

In [None]:
# Re-display the logistic-regression confusion matrix for comparison with the SVC one.
cm

In [None]:
# Confusion matrix for the SVC: rows are true labels, columns are predictions.
svc_cm = confusion_matrix(y_test, svc_y_pred)

In [None]:
# Display the SVC confusion matrix.
svc_cm

In [None]:
# Overlay both ROC curves on one axes; the dashed red diagonal is the
# chance-level reference (TPR == FPR).
plt.title('Receiver Operating Characteristic')
roc_curves = (
    (fpr, tpr, 'b', 'LR AUC = %0.2f' % roc_auc),
    (svc_fpr, svc_tpr, 'g', 'SVC AUC = %0.2f' % svc_roc_auc),
)
for curve_fpr, curve_tpr, line_style, legend_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, line_style, label = legend_label)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
# Clamp both axes to the unit square.
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

#### Thank You!