In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load the customer-churn CSV into a DataFrame and display it as the cell output.
customer = pd.read_csv("../input/ai-lab-24/CustomerChurn.csv")
customer

In [None]:
# Replace each listed column with integer codes. A fresh LabelEncoder is fitted
# per column, so the codes of one column are independent of the others.
for column in ("International plan", "Voice mail plan", "Churn", "State"):
    customer[column] = LabelEncoder().fit_transform(customer[column])
customer

In [None]:
import seaborn as sns

# Bar chart of how many rows fall into each Churn class.
# The column is passed as `x=` by keyword: in seaborn >= 0.12 the first
# positional parameter is `data` and everything else is keyword-only, so the
# positional form `countplot("Churn", data=customer)` raises a TypeError.
sns.countplot(x="Churn", data=customer)

In [None]:
# Separate the predictors from the Churn target, then hold out 20% of the rows
# for testing. The split is stratified on the target and seeded for repeatability.
churn_features = customer.drop(columns=["Churn"])
churn_target = customer["Churn"]
X_train, X_test, y_train, y_test = train_test_split(
    churn_features,
    churn_target,
    test_size=0.2,
    random_state=42,
    stratify=churn_target,
)

In [None]:
import matplotlib.pyplot as plt

# Sweep the number of neighbours and record k-NN accuracy on the training split
# (`tr`) and on the held-out split (`val`) for every k.
k_nei = list(range(1, 30))
tr, val = [], []
for i in k_nei:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    tr.append(knn.score(X_train, y_train))
    val.append(knn.score(X_test, y_test))

# Plot against the actual k values. Plotting the bare lists would put the
# curves on the list index (0..28), shifting every point one step left of its k.
fig, ax = plt.subplots()
ax.plot(k_nei, tr, label="train")
ax.plot(k_nei, val, label="held-out")
ax.set(xlabel="n_neighbors (k)", ylabel="accuracy", title="k-NN accuracy vs. k")
ax.legend()

In [None]:
# Fit four baseline classifiers on the training split.
gaussnb = GaussianNB().fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
lr = LogisticRegression(max_iter=5000).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=300, random_state=42).fit(X_train, y_train)

# Accuracy of each model, in the order (GaussianNB, k-NN, LogisticRegression,
# RandomForest). All four are scored on the same held-out split so the numbers
# are comparable; scoring any of them on the training split would overstate it.
(
    gaussnb.score(X_test, y_test),
    knn.score(X_test, y_test),
    lr.score(X_test, y_test),
    rf.score(X_test, y_test),
)

In [None]:
# Base learners for the stacked ensemble. Each name matches the estimator it
# labels, so lookups such as `clf.named_estimators_["rf"]` return the model
# the name suggests.
estimators = [
    ("gaussnb", GaussianNB()),
    ("knn", KNeighborsClassifier(n_neighbors=15)),
    ("rf", RandomForestClassifier(random_state=41569)),
]

# Logistic regression is the meta-learner that combines the base learners'
# predictions. The cell output is the ensemble's accuracy on the held-out split.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(max_iter=5000))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Load the BigMart train and test CSVs.
train = pd.read_csv("../input/bigmart-sales-data/Train.csv")
test = pd.read_csv("../input/bigmart-sales-data/Test.csv")
# Stack both frames row-wise into one frame for joint inspection. The original
# row labels are kept (no ignore_index), so index values can repeat in `sales`.
sales = pd.concat([train, test], axis=0)
sales

In [None]:
# Number of missing values in each column of the combined frame.
sales.isna().sum()

In [None]:
# Replace missing outlet sizes with an explicit "NotAvail" category in all
# three frames. The filled column is assigned back to the frame:
# `frame[col].fillna(..., inplace=True)` operates on a temporary Series, which
# pandas 2.x warns about and which leaves the frame unchanged under copy-on-write.
for frame in (train, test, sales):
    frame["Outlet_Size"] = frame["Outlet_Size"].fillna("NotAvail")

# Re-check the per-column missing-value counts after the fill.
sales.isna().sum()

In [None]:
# Bare expression: under IPython's default display settings only the last
# expression of a cell is shown, so this line produces no output.
sales.columns
# Display the listed columns of the combined frame.
sales[['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type']]

In [None]:
# Inspection only: not displayed, because it is not the cell's last expression.
sales["Item_Fat_Content"].unique()

# The column spells the same two categories several ways; collapse them to
# 0 (low fat) and 1 (regular).
d = {
    "Low Fat": 0,
    "Regular": 1,
    "low fat": 0,
    "LF": 0,
    "reg": 1
}

# Apply the mapping to the combined frame and to `train`/`test`, the frames
# that are actually fed to the model, so all three share one encoding.
# NOTE: `.map` turns any label missing from `d` into NaN, so running this cell
# a second time (when the column already holds 0/1) would blank the column.
for frame in (sales, train, test):
    frame["Item_Fat_Content"] = frame["Item_Fat_Content"].map(d)
sales

In [None]:
# Summary statistics for the combined frame.
sales.describe()

In [None]:
# Columns treated as categorical features.
categorical_cols = [
    "Item_Identifier",
    "Item_Fat_Content",
    "Item_Type",
    "Outlet_Identifier",
    "Outlet_Size",
    "Outlet_Location_Type",
    "Outlet_Type",
]

# Print the frequency of every level, missing values included, one column at a time.
for column_name in categorical_cols:
    level_counts = sales[column_name].value_counts(dropna=False)
    print(level_counts)

# Baselines

CatBoost is a gradient-boosted decision tree (GBDT) library, similar to XGBoost and LightGBM.

In [None]:
from sklearn.model_selection import train_test_split

# Carve a 20% validation set out of the labelled data, with a fixed seed.
# NOTE: `train` is rebound to the feature-only training part, so this cell
# cannot be run twice without reloading the data (the target column is gone).
train, val, train_labels, val_labels = train_test_split(
    train.drop(columns="Item_Outlet_Sales"),
    train["Item_Outlet_Sales"],
    test_size=0.2,
    random_state=34125,
)

In [None]:
import catboost as cb

# Wrap each split in a CatBoost Pool, declaring the same categorical columns
# for all three. The test pool carries no labels.
train_pool = cb.Pool(data=train, label=train_labels, cat_features=categorical_cols)
val_pool = cb.Pool(data=val, label=val_labels, cat_features=categorical_cols)
test_pool = cb.Pool(data=test, cat_features=categorical_cols)

In [None]:
# Baseline regressor: 1000 boosting iterations, all other settings left at
# their defaults.
model = cb.CatBoostRegressor(iterations=1000)
# Fit on the training pool and evaluate on the validation pool during
# training; verbose=100 sets the logging period to every 100 iterations.
model.fit(train_pool, eval_set=val_pool, verbose=100)

In [None]:
# Raw model predictions for the test pool (displayed, not stored).
model.predict(test_pool)

In [None]:
# Build the submission frame: the two identifier columns plus the predicted
# sales, clipped at 0 so no prediction is negative.
# `.assign` returns a new frame, so nothing is written into a slice of `test`
# (the pattern that triggers SettingWithCopyWarning).
sub = test[["Item_Identifier", "Outlet_Identifier"]].assign(
    Item_Outlet_Sales=model.predict(test_pool).clip(0)
)
sub.to_csv("submission.csv", index=False)
sub

# Baseline RMSE score: 1151
The test score is not available until the competition ends.
https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii