![](https://www.codehub.gr/wp-content/uploads/2018/01/cropped-CodeHub-logo_320x132.png)

In [1]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier

from sklearn.metrics import accuracy_score


import matplotlib.pyplot as plt

np.set_printoptions(precision=3, suppress=True)
pd.set_option('precision', 3)
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv("carbi.data.csv")
df = df.iloc[: , 1:]
df.dtypes

cylinders          int64
displayments     float64
horsepower       float64
weight             int64
acceleration     float64
model year         int64
cluster            int64
mpg              float64
origin_1           int64
origin_2           int64
origin_3           int64
origin             int64
car name          object
brand             object
mpg_scaled       float64
cylinders_str      int64
origin_str         int64
dtype: object

### Bagging

In [3]:
def RandomTree(source,target):


    if len(source.values.shape)==1:
        X = source.values.reshape(-1,1)
    
    else:
        X = source.values 

    y = target.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=4)  #Πειράξτε αυτή την παράμετρο, πολλές φορές μου βγάζει 100-98(overfitting)
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test) 
    
    print(f'train set: {accuracy_score(y_train, pred_train)*100:.2f}%')
    print(f'test set: {accuracy_score(y_test, pred_test)*100:.2f}%')

In [4]:
RandomTree(df[['weight','horsepower','displayments','cylinders','origin','acceleration']],df['cluster'])

train set: 99.69%
test set: 98.75%


### Boosting

In [7]:
def Gradient(source,target):


    if len(source.values.shape)==1:
        X = source.values.reshape(-1,1)
    
    else:
        X = source.values 

    y = target.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = GradientBoostingClassifier(n_estimators=4)  
    model.fit(X_train, y_train)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test) 
    
    print(f'train set: {accuracy_score(y_train, pred_train)*100:.2f}%')
    print(f'test set: {accuracy_score(y_test, pred_test)*100:.2f}%')

In [8]:
GradientBoostingClassifier(df[['weight','horsepower','displayments','cylinders','origin','acceleration']],df['cluster'])

GradientBoostingClassifier(learning_rate=0      1
1      1
2      1
3      1
4      1
      ..
393    0
394    0
395    0
396    0
397    0
Name: cluster, Length: 398, dtype: int64,
                           loss=     weight  horsepower  displayments  cylinders  origin  acceleration
0      3504       130.0         307.0          8       1          12.0
1      3693       165.0         350.0          8       1          11.5
2      3436       150.0         318.0          8       1          11.0
3      3433       150.0         304.0          8       1          12.0
4      3449       140.0         302.0          8       1          10.5
..      ...         ...           ...        ...     ...           ...
393    2790        86.0         140.0          4       1          15.6
394    2130        52.0          97.0          4       2          24.6
395    2295        84.0         135.0          4       1          11.6
396    2625        79.0         120.0          4       1          18.6
397  

### Stacking

In [3]:
def StackForGrad(source,target):


    if len(source.values.shape)==1:
        X = source.values.reshape(-1,1)
    
    else:
        X = source.values 

    y = target.values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    estimators = [ ('rf', RandomForestClassifier(n_estimators=2)), 
               ('gb', GradientBoostingClassifier(n_estimators=2))]
    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    pred_train = stack.predict(X_train)
    pred_test = stack.predict(X_test) 
    
    print(f'train set: {accuracy_score(y_train, pred_train)*100:.2f}%')
    print(f'test set: {accuracy_score(y_test, pred_test)*100:.2f}%')

In [4]:
StackForGrad(df[['weight','horsepower','displayments','cylinders','origin','acceleration']],df['cluster'])

train set: 100.00%
test set: 100.00%
