In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Danh sách nhóm

+ Nguyễn Xuân Vĩnh Phú
+ Đỗ Nhật Kha
+ Trần Cao Khánh Ngọc
+ Ngô Quang Bảo

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Exploratory Data Analysis (EDA)

In [None]:
# Load the training, competition-test and dev CSV files.
# Training rows that contain any missing value are discarded right after loading.
train_car = pd.read_csv('../input/week1-car-acceptability/car_acc_train.csv').dropna()
test_car = pd.read_csv('../input/it2034ch1502-car-acceptability-prediction/test.csv')
dev_car = pd.read_csv('../input/week1-car-acceptability/car_acc_dev_v2.csv')

In [None]:
train_car.head()

In [None]:
len(train_car), len(test_car), len(dev_car)

### Tăng cường dữ liệu

- tìm dataset có tính chất tương đương
- Extract tất cả các câu trong tập test chưa được gán nhãn và đặt làm tập dev
- Notebook: https://www.kaggle.com/ppprabbit/big-data-week1-data-augment

###  Kiểm tra dữ liệu tập train, test có trùng nhau không

In [None]:
# Compare the six attribute columns of train and test: if the de-duplicated union is
# smaller than the two sizes added together, some attribute combinations are repeated.
feature_cols = ['buying_price', 'maintenance_price', 'number_of_doors',
                'carry_capacity', 'trunk_size', 'safety']
train_temp = train_car[feature_cols]
test_temp = test_car[feature_cols]
concat_df = pd.concat((train_temp, test_temp)).drop_duplicates()
len(train_temp), len(test_temp), len(concat_df)

### Null count

In [None]:
train_car.isnull().sum()

In [None]:
dev_car.isnull().sum()

In [None]:
test_car.isnull().sum()

In [None]:
train_car.describe()

## Check feature interaction

In [None]:
# One contingency table per attribute: rows are the attribute's levels, columns are the
# acceptability classes, cells are training-row counts.
(buying_price, maintenance_price, number_of_doors,
 carry_capacity, trunk_size, safety) = (
    pd.crosstab(train_car[attribute], train_car['acceptability'])
    for attribute in ('buying_price', 'maintenance_price', 'number_of_doors',
                      'carry_capacity', 'trunk_size', 'safety')
)

### Buying price

In [None]:
buying_price

In [None]:
# Grouped bar chart of training-row counts per buying-price level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = buying_price.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='buying_price', y='value', hue='acceptability', ax=ax)

### Maintenance Price

In [None]:
maintenance_price

In [None]:
# Grouped bar chart of training-row counts per maintenance-price level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = maintenance_price.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='maintenance_price', y='value', hue='acceptability', ax=ax)

### Number of doors

In [None]:
number_of_doors

In [None]:
# Grouped bar chart of training-row counts per door-count level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = number_of_doors.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='number_of_doors', y='value', hue='acceptability', ax=ax)

### Carry capacity

In [None]:
carry_capacity

In [None]:
# Grouped bar chart of training-row counts per carry-capacity level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = carry_capacity.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='carry_capacity', y='value', hue='acceptability', ax=ax)

### Trunk size

In [None]:
trunk_size

In [None]:
# Grouped bar chart of training-row counts per trunk-size level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = trunk_size.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='trunk_size', y='value', hue='acceptability', ax=ax)

### Safety

In [None]:
safety

In [None]:
# Grouped bar chart of training-row counts per safety level, one bar per acceptability class.
f, ax = plt.subplots(figsize=(9, 9))
stacked = safety.stack().rename('value').reset_index()
sns.barplot(data=stacked, x='safety', y='value', hue='acceptability', ax=ax)

# Building model

## Hướng tiếp cận
+ Xây dựng và tối ưu tham số bằng thư viện sklearn, sau đó dùng các tham số tìm được để khởi tạo mô hình PySpark
+ Output cuối cùng của mô hình được tính bằng chồng các model lên với nhau (ensemble)

## Feature binarizing

In [None]:
# Integer codes for the four acceptability classes, and the inverse lookup used to
# turn predicted codes back into class names.
label_mapper = dict(zip(('unacc', 'acc', 'good', 'vgood'), range(4)))
label_demapper = {code: name for name, code in label_mapper.items()}

In [None]:
train_car['acceptability'].unique()

In [None]:
def build_data(df):
    """One-hot encode the six car attributes of ``df``.

    Returns ``(X, y)`` when ``df`` has an ``acceptability`` column, where ``y`` is that
    column translated through ``label_mapper``; otherwise returns ``X`` alone.
    """
    attribute_columns = ['buying_price', 'maintenance_price', 'number_of_doors',
                         'carry_capacity', 'trunk_size', 'safety']
    features = pd.get_dummies(df[attribute_columns])
    if 'acceptability' not in df.columns:
        # Unlabelled frame: there is nothing to encode as a target.
        return features
    targets = df['acceptability'].map(label_mapper)
    return features, targets

## Build models

### Data preparation

In [None]:
# Encode the training and dev splits into one-hot features and integer labels.
X_train, y_train = build_data(train_car)
X_dev, y_dev = build_data(dev_car)
# pd.get_dummies only emits columns for the categories present in the frame it is given,
# so put the dev matrix on the training column layout (same columns, same order);
# a category absent from dev becomes an all-zero column.
X_dev = X_dev.reindex(columns=X_train.columns, fill_value=0)

#### Grid search for finding the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

def find_best_params(params, cls_model, X_train, y_train, X_test, y_test):
    """Grid-search ``params`` for ``cls_model`` on a single fixed train/validation split.

    Every parameter combination is fitted on ``(X_train, y_train)`` and scored by
    accuracy on ``(X_test, y_test)``. The best score and parameters are printed.

    Returns:
        ``(grid_model, grid_results)``: the ``GridSearchCV`` object and the value
        returned by its ``fit`` call.
    """
    # -1 keeps a row out of every validation fold (train only); 0 puts it in the one validation fold.
    fold_of_row = [-1] * len(X_train) + [0] * len(X_test)

    grid_model = GridSearchCV(
        estimator=cls_model,
        param_grid=params,
        cv=PredefinedSplit(test_fold=fold_of_row),
        scoring='accuracy',
        verbose=20,
        n_jobs=-1,
        refit=False,
    )
    # Rows are stacked train-first so they line up with fold_of_row.
    grid_results = grid_model.fit(pd.concat((X_train, X_test)),
                                  pd.concat((y_train, y_test)))
    print('Best Score %.4f' % grid_model.best_score_)
    print('Best params : ')
    print(grid_model.best_params_)

    return grid_model, grid_results

### SVM - linear/non-linear

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search over the SVC regularisation strength and kernel, scored on the dev split.
svc_params = dict(C=[0.05, 0.1, 1, 10, 20, 50, 100],
                  kernel=['linear', 'poly', 'rbf', 'sigmoid'])
svm_model = svm.SVC(random_state=2021, kernel='rbf')
_, svc_results = find_best_params(svc_params, svm_model, X_train, y_train, X_dev, y_dev)

In [None]:
# Fit the polynomial-kernel SVC and report per-class metrics on the dev split.
# probability=True makes predict_proba available for the probability-averaging ensembles.
best_svm_cls = svm.SVC(C=10, kernel='poly', probability=True, random_state=2021).fit(X_train, y_train)
svc_predicts = best_svm_cls.predict(X_dev)
print(classification_report(y_dev, svc_predicts, digits=4))

## XGBoost

In [None]:
import xgboost as xgb
from xgboost import plot_importance
from xgboost import XGBClassifier

In [None]:
# Search over tree depth, learning rate and number of boosting rounds for XGBoost.
params = dict(
    max_depth=[3, 4, 5, 6, 7, 8],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    n_estimators=[500, 600, 700, 800, 900, 1000, 1200, 1300, 1400, 1500],
)

xgb_cls = xgb.XGBClassifier(seed=20)
_, xgb_results = find_best_params(params, xgb_cls, X_train, y_train, X_dev, y_dev)

In [None]:
# Fit XGBoost with fixed hyperparameters and report per-class metrics on the dev split.
best_xgb = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=1200,
    seed=20,
).fit(X_train, y_train)
y_pred = best_xgb.predict(X_dev)
print(classification_report(y_dev, y_pred, digits=4))

## LightGBM 

In [None]:
from lightgbm import LGBMClassifier

# Search over boosting rounds, boosting type, leaf count and learning rate for LightGBM.
# NOTE(review): the 'rf' boosting type usually needs bagging parameters that are not set
# here - confirm those candidates actually fit instead of failing silently.
light_gbm_params = dict(
    n_estimators=[1000, 1200, 1300, 1400, 1500, 2000, 2200, 2500, 2800, 3000, 3500, 4000],
    boosting_type=['gbdt', 'dart', 'goss', 'rf'],
    num_leaves=[6, 8, 10, 15, 20],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
)

lightgbm = LGBMClassifier(
    objective='multiclass',
    num_leaves=6,
    max_bin=200,
    verbose=-1,
    random_state=42,
)

lightgbm_best_model, lightgbm_results = find_best_params(
    light_gbm_params, lightgbm, X_train, y_train, X_dev, y_dev)

In [None]:
# Fit LightGBM with fixed hyperparameters and report per-class metrics on the dev split.
# NOTE(review): max_bin=2800 looks like it may have been meant as n_estimators - 2800 is one
# of the n_estimators candidates in the search cell, whose base model uses max_bin=200.
# As written, n_estimators and learning_rate are left at the library defaults. Confirm intent.
best_lgbm = LGBMClassifier(objective='multiclass',
                          num_leaves=15,
                          max_bin=2800,
#                           learning_rate=0.05,  (disabled: library default is used instead)
                          boosting_type='gbdt',
                          verbose=-1,
                          random_state=42)
best_lgbm.fit(X_train, y_train)
# y_pred is a notebook-wide name that each model's evaluation cell overwrites.
y_pred = best_lgbm.predict(X_dev)
print(classification_report(y_dev, y_pred, digits=4))

## Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Search over forest size, split criterion, per-split feature budget and class weighting.
random_forest_cls = RandomForestClassifier(random_state=2021)

# 'auto' is omitted from max_features: for classifiers it was an alias of 'sqrt' (deprecated
# and later removed from scikit-learn), so listing both fitted every combination twice.
random_forest_params = {'n_estimators': [1200, 1300, 1500, 1700, 2000, 2200, 2400, 2500, 2700, 3000, 3200],
                        'criterion': ['gini', 'entropy'],
                        'max_features': ['sqrt', 'log2'],
                        'class_weight': ['balanced', 'balanced_subsample']}
_, random_forest_results = find_best_params(random_forest_params, random_forest_cls, X_train, y_train, X_dev, y_dev)

In [None]:
# Fit the random forest with fixed hyperparameters and report per-class metrics on the dev split.
# max_features='sqrt' is what the removed alias 'auto' meant for classifiers.
best_rfc = RandomForestClassifier(random_state=2021, class_weight='balanced', criterion='gini',
                                  max_features='sqrt', n_estimators=2000)
best_rfc.fit(X_train, y_train)
# Store the forest's own predictions so the report scores this model and not
# whatever y_pred an earlier cell left behind.
y_pred = best_rfc.predict(X_dev)
print(classification_report(y_dev, y_pred, digits=4))

## Gradient boosting classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Search over boosting rounds, learning rate, tree depth and per-split feature budget.
# 'auto' is omitted from max_features: for classifiers it was an alias of 'sqrt' (deprecated
# and later removed from scikit-learn), so listing both fitted every combination twice.
gradient_boost_params = {'n_estimators': [800, 1000, 1100, 1300, 1500, 1600, 2300, 2500],
                         'learning_rate': [0.005, 0.01, 0.05, 0.1],
                         'max_depth': [3, 4, 5],
                         'max_features': ['sqrt', 'log2']}
gradient_boost_model = GradientBoostingClassifier(random_state=2021)
_, gradient_boost_results = find_best_params(gradient_boost_params, gradient_boost_model, X_train, y_train, X_dev, y_dev)

In [None]:
# Fit gradient boosting with fixed hyperparameters and report per-class metrics on the dev split.
# max_features='sqrt' is what the removed alias 'auto' meant for classifiers.
best_gdb = GradientBoostingClassifier(n_estimators=1300,
                                      learning_rate=0.05,
                                      max_depth=5,
                                      max_features='sqrt',
                                      random_state=2021)
best_gdb.fit(X_train, y_train)
y_pred = best_gdb.predict(X_dev)
print(classification_report(y_dev, y_pred, digits=4))

### Voting ensemble model

In [None]:
from sklearn.ensemble import StackingClassifier, VotingClassifier

def voting_ensemble(X_dev, lgbm, xgb, svc, gdb, rfc):
    """Fit a weighted soft-voting ensemble of the five models and predict ``X_dev``.

    The ensemble is fitted on the notebook-level ``X_train`` / ``y_train``.
    Returns the predicted class codes for ``X_dev``.
    """
    # (name, estimator, voting weight) - list order is the order handed to VotingClassifier.
    members = [('svc', svc, 0.15),
               ('xgb', xgb, 0.15),
               ('lgbm', lgbm, 0.4),
               ('rfc', rfc, 0.1),
               ('gdb', gdb, 0.15)]

    ensemble = VotingClassifier([(name, estimator) for name, estimator, _ in members],
                                voting='soft',
                                weights=[weight for _, _, weight in members],
                                n_jobs=-1)
    ensemble.fit(X_train, y_train)
    return ensemble.predict(X_dev)

# Evaluate the soft-voting ensemble on the dev split.
y_pred = voting_ensemble(X_dev, lgbm=best_lgbm, xgb=best_xgb, svc=best_svm_cls,
                         gdb=best_gdb, rfc=best_rfc)
print(classification_report(y_dev, y_pred, digits=4))

## average output probs ensemble

In [None]:
def ensemble_models(X_dev, lgbm, xgb, svc, gdb, rfc):
    """Blend the class probabilities of five already-fitted models with fixed weights.

    Each model's ``predict_proba(X_dev)`` is scaled by its weight (lgbm 0.4, xgb 0.15,
    svc 0.15, gdb 0.15, rfc 0.1) and summed; the column index with the highest blended
    value is returned for every row.
    """
    probabilities = {
        name: model.predict_proba(X_dev)
        for name, model in (('svc', svc), ('xgb', xgb), ('lgbm', lgbm), ('gdb', gdb), ('rfc', rfc))
    }
    weights = (('lgbm', 0.4), ('xgb', 0.15), ('svc', 0.15), ('gdb', 0.15), ('rfc', 0.1))
    blended = sum(weight * probabilities[name] for name, weight in weights)
    return np.argmax(blended, axis=1)

# Evaluate the weighted probability-averaging ensemble on the dev split.
y_pred = ensemble_models(X_dev, lgbm=best_lgbm, xgb=best_xgb, svc=best_svm_cls,
                         gdb=best_gdb, rfc=best_rfc)
print(classification_report(y_dev, y_pred, digits=4))

In [None]:
# Encode the unlabelled test set and put it on the training column layout: pd.get_dummies
# only emits columns for categories present in its input, so a category missing from the
# test file would otherwise leave X_test with fewer or differently ordered columns.
X_test = build_data(test_car).reindex(columns=X_train.columns, fill_value=0)

# Build pyspark model

In [None]:
!pip install pyspark

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the same train/test CSV files into Spark DataFrames, using the first line as the header.
# Training rows with missing values are dropped, as on the pandas side.
df_train = spark.read.csv("../input/week1-car-acceptability/car_acc_train.csv", header=True).dropna()
df_test = spark.read.csv("../input/it2034ch1502-car-acceptability-prediction/test.csv", header=True)

In [None]:
df_train.show(5)

### Feature transformation

In [None]:
# Attribute columns are everything between the first and the last column of the training frame.
# NOTE(review): this assumes the first column is an id and the last one is the label - confirm.
feature_columns = df_train.columns[1:-1]
indexed_columns = [column + "_category" for column in feature_columns]

# Encode each string attribute as a numeric index, learning the vocabulary from the training frame.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_category").fit(df_train)
            for column in feature_columns]
pipeline = Pipeline(stages=indexers)
transformer = pipeline.fit(df_train)

train = transformer.transform(df_train)
test = transformer.transform(df_test)

# Encode the target; label_indexer.labels holds the class name behind each index.
label_indexer = StringIndexer(inputCol='acceptability', outputCol='acceptability_category').fit(df_train)
train = label_indexer.transform(train)

# Assemble the indexed attributes into one vector column. Columns are selected by name
# rather than by position so that a different column count in either frame cannot shift
# the selection onto the wrong columns (for example onto the encoded label).
feature_assembler = VectorAssembler(inputCols=indexed_columns, outputCol="features")
train = feature_assembler.transform(train)
test_feature_assembler = VectorAssembler(inputCols=indexed_columns, outputCol="features")
test = test_feature_assembler.transform(test)

### SVC model

In [None]:
from sklearn.metrics import classification_report
from sklearn import svm

# Refit the polynomial-kernel SVC on the training data and predict the test features.
best_svm_cls = svm.SVC(C=10, kernel='poly', probability=True, random_state=2021).fit(X_train, y_train)
svc_predicts = best_svm_cls.predict(X_test)

In [None]:
# Pair each test car id with the SVC prediction, decoded from class code to class name.
svc_df = (
    pd.DataFrame({'car_id': test_car['car_id'], 'acceptability': svc_predicts})
    .assign(acceptability=lambda frame: frame['acceptability'].map(label_demapper))
)

## Random Forest model

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Train a random forest on the Spark feature vectors and score the test frame.
# RandomForestClassifier here is the pyspark.ml class imported in the preceding cell,
# which rebinds the name used earlier in the notebook for scikit-learn's class.
rdf = RandomForestClassifier(
    labelCol="acceptability_category",
    featuresCol="features",
    numTrees=2000,
    maxDepth=10,
    impurity='gini',
    seed=465,
)
rdf_model = rdf.fit(train)
rdf_predict = rdf_model.transform(test)

In [None]:
rdf_predict.select('car_id', 'prediction').show(5)

In [None]:
# rdf_predict is still a Spark DataFrame at this point and Spark columns have no
# unique() method, so list the distinct predicted class indices through Spark's own API.
rdf_predict.select('prediction').distinct().show()

In [None]:
# Bring the Spark predictions into pandas and expose them under the submission column name.
rdf_predict = rdf_predict.toPandas().rename(columns={'prediction': 'acceptability'})
# Decode with the vocabulary the label indexer actually learned. StringIndexer numbers
# labels by how often they occur, so a hand-written index-to-name table is only right
# when the class frequencies happen to follow that exact order.
lb_demapper = {float(index): label for index, label in enumerate(label_indexer.labels)}
rdf_predict['acceptability'] = rdf_predict['acceptability'].map(lb_demapper)

In [None]:
rdf_predict.head()

In [None]:
# Keep only the two columns that make up a submission.
rdf_predict = rdf_predict.loc[:, ['car_id', 'acceptability']]

## Gradient boosting model

In [None]:
# Refit gradient boosting on the training data and predict the test features.
# max_features='sqrt' is what the removed alias 'auto' meant for classifiers.
best_gdb = GradientBoostingClassifier(n_estimators=1300,
                                      learning_rate=0.05,
                                      max_depth=5,
                                      max_features='sqrt',
                                      random_state=2021)
best_gdb.fit(X_train, y_train)
gdb_pred = best_gdb.predict(X_test)

In [None]:
# Pair each test car id with the gradient-boosting prediction, decoded to a class name.
gdb_predict = (
    pd.DataFrame({'car_id': test_car['car_id'], 'acceptability': gdb_pred})
    .assign(acceptability=lambda frame: frame['acceptability'].map(label_demapper))
)

## XGBoost model

In [None]:
# Refit XGBoost on the training data and predict the test features.
best_xgb = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=1200,
    seed=20,
).fit(X_train, y_train)
xgb_pred = best_xgb.predict(X_test)

In [None]:
# Pair each test car id with the XGBoost prediction, decoded to a class name.
xgb_predict = (
    pd.DataFrame({'car_id': test_car['car_id'], 'acceptability': xgb_pred})
    .assign(acceptability=lambda frame: frame['acceptability'].map(label_demapper))
)

In [None]:
xgb_predict.head(), gdb_predict.head(), rdf_predict.head(), svc_df.head()

In [None]:
len(svc_df)

### Voting model 

In [None]:
from collections import Counter

In [None]:
def voting_classifer(xgb_predict, gdb_predict, rdf_predict, svc_df):
    """Combine four prediction frames by majority vote.

    Each argument is a DataFrame with an ``acceptability`` column; ``xgb_predict`` also
    supplies the ``car_id`` column. Frames are lined up on their index.

    Returns:
        DataFrame with columns ``car_id`` and ``acceptability`` holding, per row, the
        label predicted by the most models. Ties go to the label that appears first in
        the order xgb, gdb, rdf, svc.
    """
    voting_output = pd.concat(
        (xgb_predict, gdb_predict['acceptability'], rdf_predict['acceptability'], svc_df['acceptability']), axis=1)
    data = []
    for _, row in voting_output.iterrows():
        # Four columns share the name 'acceptability', so this selects all four votes.
        # Build the counter directly: Counter.update() returns None and must not be assigned.
        counter = Counter(row['acceptability'])
        voting_label = max(counter, key=counter.get)
        data.append((row['car_id'], voting_label))
    return pd.DataFrame(data, columns=['car_id', 'acceptability'])

## Get File for prediction

In [None]:
# Alternative submission based on the hard-voting helper (kept for reference):
# data = voting_classifer(xgb_predict, gdb_predict, rdf_predict, svc_df)
# data.to_csv('submit.csv', index=False)

# Score the test features. Predicting on X_dev here would pair dev-set predictions
# with test-set car ids in the submission file.
y_pred = ensemble_models(X_test, best_lgbm, best_xgb, best_svm_cls, best_gdb, best_rfc)
assert len(y_pred) == len(test_car), "one prediction is required per test row"
test_label = [label_demapper[w] for w in y_pred]

submit_df = pd.DataFrame({'car_id': test_car['car_id'], 'acceptability': test_label})
submit_df.to_csv('submit.csv', index=False)

