# Note: this notebook was run on Kaggle, so you need to modify the data paths to use it in Colab

# Lab 2-1: Groupby operations

Some imports:

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
try:
    import seaborn
except ImportError:
    pass

# Show at most 10 rows when a DataFrame/Series is displayed, so that cell
# outputs in this notebook stay short.
pd.options.display.max_rows = 10

## Some 'theory': the groupby operation (split-apply-combine)

The "group by" concept: we want to **apply the same function on subsets of your dataframe, based on some key to split the dataframe in subsets**

This operation is also referred to as the "split-apply-combine" operation, involving the following steps:

* **Splitting** the data into groups based on some criteria
* **Applying** a function to each group independently
* **Combining** the results into a data structure

Similar to SQL `GROUP BY`

The example of the image in pandas syntax:

In [None]:
# Toy frame for the split-apply-combine illustration: the keys A, B, C
# repeat three times, each occurrence carrying its own data value.
group_keys = ['A', 'B', 'C'] * 3
data_values = [0, 5, 10, 5, 10, 15, 10, 15, 20]
df = pd.DataFrame({'key': group_keys, 'data': data_values})
df

Using the filtering and reduction operations we have seen in the previous notebooks, we could do something like:


    df[df['key'] == "A"].sum()
    df[df['key'] == "B"].sum()
    ...

But pandas provides the `groupby` method to do this:

In [None]:
# Aggregate each group with a sum. The reduction is named by string rather
# than passed as the NumPy callable: recent pandas versions deprecate
# passing np.sum here and ask for the string alias instead.
df.groupby('key').aggregate('sum')

In [None]:
# The same per-key sum, written with the dedicated GroupBy.sum() shortcut.
df.groupby('key').sum()

And many more methods are available. 

## And now applying this on some real data

We go back to the titanic survival data:

In [None]:
!git clone https://gist.github.com/michhar/2dfd2de0d4f8727f873422c5d959fff5
# Here is just the sample dir, you should correct your dir
df = pd.read_csv("./2dfd2de0d4f8727f873422c5d959fff5/titanic.csv")

In [None]:
# Preview the first rows of the Titanic data.
df.head()

<div class="alert alert-success">
    <b>EXERCISE</b>: Using groupby(), calculate the average age for each sex.
</div>

In [None]:
# Average age per sex: group the 'Age' column by the values of the 'Sex'
# column and take the mean within each group.
df["Age"].groupby(df["Sex"]).mean()

<div class="alert alert-success">
    <b>EXERCISE</b>: Calculate the average survival ratio for all passengers.
</div>

In [None]:
# Relative frequency of each value in 'Survived' (normalize=True turns the
# counts into fractions of the non-missing rows).
# NOTE(review): presumably 1 means "survived", so the entry for 1 is the
# overall survival ratio - confirm against the data description.
df["Survived"].value_counts(normalize=True)

<div class="alert alert-success">
    <b>EXERCISE</b>: Calculate this survival ratio for all passengers younger than 25 (remember: filtering/boolean indexing).
</div>

In [None]:
# Survival ratio of passengers *younger than* 25: the comparison is strict,
# so passengers aged exactly 25 are excluded, as the exercise asks.
# Rows with a missing age never satisfy the comparison and are left out.
# .loc selects rows and column in one step instead of chained indexing.
df.loc[df["Age"] < 25, "Survived"].mean()

<div class="alert alert-success">
    <b>EXERCISE</b>: Is there a difference in this survival ratio between the sexes? (tip: write the above calculation of the survival ratio as a function)
</div>

In [None]:
# Survival ratio per sex: mean of the 'Survived' column within each
# 'Sex' group, requested through the generic agg() entry point.
df.groupby("Sex")["Survived"].agg("mean")

<div class="alert alert-success">
    <b>EXERCISE</b>: Make a bar plot of the survival ratio for the different classes ('Pclass' column).
</div>

In [None]:
import seaborn as sns

# The exercise asks for the survival *ratio* per class, not the raw passenger
# counts. barplot aggregates the y values of every x category with its default
# estimator (the mean), and the mean of 'Survived' within a class is that
# class's survival ratio (assuming 'Survived' is coded 0/1, as the
# mean-based ratios in the previous exercises do).
sns.barplot(x="Pclass", y="Survived", data=df)

If you are ready, more groupby exercises can be found in the "Advanced groupby operations" notebook.

# 2-3 KNN for Diabetes

Predict whether a person has diabetes, using the Pima Indians Diabetes dataset.

In [None]:
# Load the diabetes CSV from the Kaggle input directory
# (adjust the path when running outside Kaggle).
# NOTE(review): `sklearn` and `datasets` are imported here but not used by
# any cell visible in this notebook.
import sklearn
from sklearn import datasets
dataset = pd.read_csv("../input/pima-indians-diabetes-database/diabetes.csv")
dataset

## 1) Data discovery

In [None]:
# Size of the data as a (number of rows, number of columns) tuple:
# rows are instances, columns are attributes.
dataset.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataset.describe()

In [None]:
# Summary statistics for every column; include='all' also covers
# non-numeric (categorical) columns, if there are any.
dataset.describe(include = 'all')

In [None]:
# Show the first 5 rows.
dataset.head(5)

In [None]:
# Show the last 3 rows.
dataset.tail(3)

In [None]:
# Number of instances (rows) belonging to each class of the 'Outcome' column.
dataset.groupby("Outcome").size()

In [None]:
# Another way to get the class sizes: value_counts() counts each distinct
# 'Outcome' value and lists the most frequent one first.
dataset["Outcome"].value_counts()

### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Pairplot

In [None]:
# Pairwise plots of the dataset columns, colored by the 'Outcome' class.
# The two markers ("o", "s") are assigned to the hue levels - this presumes
# 'Outcome' has exactly two distinct values.
sns.pairplot(dataset, hue="Outcome", height=3, markers=["o", "s"])

### Boxplot

In [None]:
# One box plot per column, split by the 'Outcome' class.
# DataFrame.boxplot(by=..., figsize=...) creates its own figure, so no
# explicit plt.figure() is needed (it would only leave an empty extra figure).
dataset.boxplot(by="Outcome", figsize=(15, 10))

## 2) Data preprocessing

In [None]:
# List the column names, to pick the feature and target columns from.
dataset.columns

In [None]:
# Feature matrix X: the eight measurement columns, in this fixed order.
# Target vector y: the 'Outcome' label. Both are plain NumPy arrays.
feature_columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
X = dataset[feature_columns].to_numpy()
y = dataset["Outcome"].to_numpy()

#### Splitting the dataset into a training set and a test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training label vector.
y_train.shape

In [None]:
# Shape of the test feature matrix.
X_test.shape

In [None]:
# Shape of the test label vector.
y_test.shape

## 3) Using KNN for classification

####  Build model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

#### Change ``k`` to find the best value

In [None]:
# Candidate values of k (1..10) and one accuracy slot per candidate.
neighbors = np.arange(1, 11)
n_candidates = neighbors.size
train_accuracy = np.empty(n_candidates)
test_accuracy = np.empty(n_candidates)

for slot in range(n_candidates):
    # Fit a k-NN classifier with the slot's k on the training split ...
    knn = KNeighborsClassifier(n_neighbors=neighbors[slot])
    knn.fit(X_train, y_train)

    # ... and record its accuracy on both the training and the test split.
    train_accuracy[slot] = knn.score(X_train, y_train)
    test_accuracy[slot] = knn.score(X_test, y_test)

In [None]:
# Accuracy as a function of k, one curve per split, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(neighbors, test_accuracy, label='Testing Accuracy')
ax.plot(neighbors, train_accuracy, label='Training accuracy')
ax.set_title('k-NN Varying number of neighbors')
ax.set_xlabel('Number of neighbors')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

#### Choose ``k = 6``

In [None]:
# Final model: k-NN with the chosen k = 6, fitted on the training split.
classifier = KNeighborsClassifier(n_neighbors=6)
classifier.fit(X_train, y_train)

####  Prediction

In [None]:
# Predict the class label of every row in the test set with the fitted classifier.
y_pred = classifier.predict(X_test)
y_pred

##### Accuracy

In [None]:
# Fraction of test rows whose predicted label equals the true label,
# scaled to a percentage and reported with two decimals.
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy of our model is equal {round(accuracy, 2)}%.")

In [None]:
# Cross-check the accuracy through the estimator's own score() method.
# This must use `classifier` (the chosen k = 6 model): `knn` is only the
# last model left over from the k sweep, fitted with a different k.
classifier.score(X_test, y_test)

In [None]:
from sklearn.model_selection import train_test_split

# Stability check: repeat the k sweep on 20 different random train/test splits
# and report, per split, the best test accuracy and the k that achieved it.
# Note that picking k by test accuracy is optimistic; this cell only
# illustrates how much the outcome depends on the split.

# Features and target do not depend on the seed, so extract them once.
X = dataset[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']].values
y = dataset["Outcome"].values

seeds = list(range(20))
k_values = np.arange(1, 35)

for seed in seeds:
    # Split-local names: the split and accuracy arrays used by the cells
    # above are left untouched, so those cells can still be re-run.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Test accuracy of a k-NN model for every candidate k on this split.
    test_acc = np.array([
        KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr).score(X_te, y_te)
        for k in k_values
    ])

    # argmax yields a position in k_values, not k itself (k starts at 1),
    # so translate the position back to the actual number of neighbors.
    best_pos = int(np.argmax(test_acc))
    print(f"Seed: {seed}, best test acc: {test_acc[best_pos]}, best k: {k_values[best_pos]}")

# Conclusion
KNN is extremely sensitive to the data: with a different train/test split, we get a different best `k` and a different accuracy. See the code above.

I personally do not recommend using KNN as a baseline because of this instability.


# Lab 2-4: CustomerChurn & BigMart Sales

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load the customer churn data from the Kaggle input directory
# (adjust the path when running outside Kaggle) and display it.
customer = pd.read_csv("../input/ai-lab-24/CustomerChurn.csv")
customer

In [None]:
# Replace each of these columns by integer codes. Every column gets its own
# freshly fitted LabelEncoder, so the codes are independent per column.
for column_name in ["International plan", "Voice mail plan", "Churn", "State"]:
    encoder = LabelEncoder()
    customer[column_name] = encoder.fit_transform(customer[column_name])
customer

In [None]:
import seaborn as sns

# Number of customers per 'Churn' class. The column is passed as the x=
# keyword: recent seaborn versions accept only `data` positionally, so a
# positional "Churn" would clash with data=customer.
sns.countplot(x="Churn", data=customer)

Because the labels are imbalanced, we should use a stratified split.

In [None]:
# Separate the target from the features, then hold out 20% of the rows.
# stratify keeps the 'Churn' class proportions the same in both parts.
churn_target = customer["Churn"]
churn_features = customer.drop(columns=["Churn"])
X_train, X_test, y_train, y_test = train_test_split(
    churn_features,
    churn_target,
    test_size=0.2,
    random_state=42,
    stratify=churn_target,
)

## KNN baseline

In [None]:
import matplotlib.pyplot as plt

# Sweep k = 1..29 and record the accuracy on the training split (`tr`) and
# on the held-out split (`val`, which here is the test split).
k_nei = list(range(1, 30))
tr, val = [], []
for i in k_nei:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    tr.append(knn.score(X_train, y_train))
    val.append(knn.score(X_test, y_test))

# Plot against the actual k values: without an explicit x the curves would be
# drawn over the list positions 0..28, shifting every k by one on the axis.
plt.plot(k_nei, tr, label="Training accuracy")
plt.plot(k_nei, val, label="Testing accuracy")
plt.title("k-NN on customer churn: varying number of neighbors")
plt.xlabel("Number of neighbors")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

## Stacking Classifier, an advanced technique

In [None]:
# Fit four individual baselines on the same training split.
gaussnb = GaussianNB().fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
lr = LogisticRegression(max_iter=5000).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train) # My favorite algorithm, actually it's GBDT

# Test-set accuracy of each model, in the order GaussianNB, k-NN, logistic
# regression, random forest. All four are scored on the test split so the
# numbers are comparable (logistic regression included).
gaussnb.score(X_test, y_test), knn.score(X_test, y_test), lr.score(X_test, y_test), rf.score(X_test, y_test)

In [None]:
# Base learners of the stack as (name, estimator) pairs. Each name matches
# the estimator it labels, so looking a model up by name returns the
# expected one.
estimators = [
    ("gaussnb", GaussianNB()),
    ("knn", KNeighborsClassifier(n_neighbors=15)),
    ("rf", RandomForestClassifier(random_state=41569)),
]

# The predictions of the base learners are combined by a logistic regression.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=5000))
clf.fit(X_train, y_train)

# Accuracy of the stacked model on the test split.
clf.score(X_test, y_test)

# BigMart sales data

In [None]:
# Load the BigMart train and test CSVs from the Kaggle input directory
# (adjust the paths when running outside Kaggle).
train = pd.read_csv("../input/bigmart-sales-data/Train.csv")
test = pd.read_csv("../input/bigmart-sales-data/Test.csv")
# Stack the rows of both files into one frame for joint inspection. The row
# labels of each file are kept as they are (no ignore_index), so the index of
# `sales` is not unique.
sales = pd.concat([train, test], axis=0)
sales

In [None]:
# Number of missing values in each column.
sales.isna().sum()

## The best way to handle NaN values, to me, is not to impute them.

In [None]:
# Mark missing outlet sizes with an explicit "NotAvail" category in all three
# frames. The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# pandas warns about and which does not reliably update the parent frame.
for frame in (train, test, sales):
    frame["Outlet_Size"] = frame["Outlet_Size"].fillna("NotAvail")

# Remaining number of missing values per column.
sales.isna().sum()

In [None]:
# Display the feature columns only (everything except the sales target).
# A bare `sales.columns` before this expression was dropped: only the last
# expression of a cell is displayed, so that line had no effect.
sales[['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type']]

In [None]:
# Collapse the inconsistent spellings of the fat-content label into two
# codes: 0 = low fat, 1 = regular.
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run: without
# them a second run would map the already encoded values to NaN.
# Any label not listed here still becomes NaN.
d = {
    "Low Fat": 0,
    "Regular": 1,
    "low fat": 0,
    "LF": 0,
    "reg": 1,
    0: 0,
    1: 1,
}
sales["Item_Fat_Content"] = sales["Item_Fat_Content"].map(d)
train["Item_Fat_Content"] = train["Item_Fat_Content"].map(d)
test["Item_Fat_Content"] = test["Item_Fat_Content"].map(d)
sales

In [None]:
# Summary statistics of the numeric columns.
sales.describe()

In [None]:
# Columns treated as categorical features (also handed to CatBoost further down).
categorical_cols = [
    "Item_Identifier",
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]

# Print the frequency of every value per categorical column, NaN included.
for column_name in categorical_cols:
    frequencies = sales[column_name].value_counts(dropna=False)
    print(frequencies)

# Baselines
CatBoost is a GBDT library like XGBoost or LightGBM. This is my favorite ML algorithm.

NOT AVAILABLE IN COLAB YET!

In [None]:
from sklearn.model_selection import train_test_split
# Split the labelled rows into a training and a validation part (20% validation).
# The target column 'Item_Outlet_Sales' is taken out of the features and
# returned separately as the labels.
# NOTE(review): `train` is rebound to the feature-only training part, so this
# cell cannot be re-run as-is: on a second run 'Item_Outlet_Sales' is no
# longer a column of `train`.
train, val, train_labels, val_labels  = train_test_split(train.drop("Item_Outlet_Sales", axis=1), 
                                                         train["Item_Outlet_Sales"], random_state=34125, test_size=0.2)

In [None]:
import catboost as cb

# Wrap each data split in a CatBoost Pool, declaring which columns are
# categorical. The test pool carries no labels: it is only used for prediction.
train_pool = cb.Pool(train, train_labels, cat_features=categorical_cols)
val_pool = cb.Pool(val, val_labels, cat_features=categorical_cols)
test_pool = cb.Pool(test, cat_features=categorical_cols)

In [None]:
# CatBoost regressor with 1000 boosting iterations, fitted on the training
# pool with the validation pool as evaluation set.
# NOTE(review): verbose=100 presumably prints a progress line every 100
# iterations and plot=True draws the learning curves - confirm in the
# CatBoost docs.
model = cb.CatBoostRegressor(iterations=1000)
model.fit(train_pool, eval_set=val_pool, verbose=100, plot=True)

In [None]:
# Build the submission: the two identifier columns plus the predicted sales.
# .copy() makes `sub` an independent frame, so adding a column to it does not
# trigger SettingWithCopyWarning (it would otherwise be a slice of `test`).
sub = test[["Item_Identifier", "Outlet_Identifier"]].copy()

# Sales cannot be negative, so predictions below zero are clipped to 0.
sub["Item_Outlet_Sales"] = model.predict(test_pool).clip(0)
sub.to_csv("submission.csv", index=False)
sub

# Public leaderboard RMSE score: 1151
Test score is not available until competition ends. https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii