#### Классификация космических кораблей по их параметрам (RandomForestClassifier)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import tree

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Training data: feature columns plus the target column `class`
# (see the preview a couple of cells below).
df = pd.read_csv('invasion.csv')

In [5]:
# (rows, columns) of the training frame.
df.shape

(500, 7)

In [6]:
# Preview the first five training rows, including the `class` label.
df.head()

Unnamed: 0,class,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,transport,2.190672,6.716633,62.168208,0.347465,158221,44.932446
1,transport,3.453276,8.995909,62.994707,0.590094,385972,41.5683
2,transport,2.432994,6.938691,62.245807,0.329288,446482,40.123467
3,fighter,6.083763,3.019459,18.474555,0.174738,210125,11.384865
4,fighter,12.876769,2.45295,195.805771,0.150446,23109,11.328806


In [7]:
# Feature matrix: every column of the training frame except the target label.
X = df.drop(columns='class')

In [8]:
# Check that `class` is no longer among the feature columns.
X.head()

Unnamed: 0,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,2.190672,6.716633,62.168208,0.347465,158221,44.932446
1,3.453276,8.995909,62.994707,0.590094,385972,41.5683
2,2.432994,6.938691,62.245807,0.329288,446482,40.123467
3,6.083763,3.019459,18.474555,0.174738,210125,11.384865
4,12.876769,2.45295,195.805771,0.150446,23109,11.328806


In [9]:
# Target vector: the `class` label of each training row.
y = df['class']

In [10]:
# Peek at the first few labels.
y.head()

0    transport
1    transport
2    transport
3      fighter
4      fighter
Name: class, dtype: object

In [11]:
# Observations to classify. The preview below shows the same six feature
# columns as X and no `class` column; no ground-truth labels for this set
# appear in the visible cells, so it is used for prediction only.
X_test = pd.read_csv('operative_information.csv')

In [12]:
# Preview the first five rows to be classified.
X_test.head()

Unnamed: 0,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,7.516543,3.916691,513.954279,0.177247,105908,13.267224
1,4.322988,6.967689,63.75297,0.545922,277855,39.83313
2,4.595724,9.098297,62.233948,0.389201,160662,42.014556
3,2.689675,7.964869,62.475495,0.541081,162092,42.056829
4,8.075576,5.169719,336.441261,0.174757,466853,11.779813


In [13]:
# Base estimator for the grid search: entropy split criterion and a fixed
# seed so repeated runs build the same forests.
clf_rf = RandomForestClassifier(
    criterion='entropy',
    random_state=0,
)

In [14]:
# Hyperparameter grid for the forest: 4 * 6 * 6 * 4 = 576 combinations.
parameters = dict(
    n_estimators=range(10, 50, 10),      # 10, 20, 30, 40 trees
    max_depth=range(1, 12, 2),           # odd depths 1, 3, ..., 11
    min_samples_leaf=range(1, 7),        # 1 through 6
    min_samples_split=range(2, 9, 2),    # 2, 4, 6, 8
)

In [15]:
# Exhaustive search over `parameters` with 3-fold cross-validation,
# using all available CPU cores.
grid_search_cv_clf = GridSearchCV(
    estimator=clf_rf,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

In [16]:
# Run the search: 576 candidates x 3 folds = 1728 fits. Because refit is left
# at its default (True), the best candidate is then retrained on all of X, y.
grid_search_cv_clf.fit(X, y)

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(criterion='entropy',
                                              random_state=0),
             n_jobs=-1,
             param_grid={'max_depth': range(1, 12, 2),
                         'min_samples_leaf': range(1, 7),
                         'min_samples_split': range(2, 9, 2),
                         'n_estimators': range(10, 50, 10)})

In [17]:
# Hyperparameters of the candidate with the best mean cross-validation score.
grid_search_cv_clf.best_params_

{'max_depth': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10}

In [18]:
# The refitted forest that uses the best hyperparameters found by the search.
best_clf = grid_search_cv_clf.best_estimator_

In [19]:
# Predicted class label for every row of X_test.
y_pred = best_clf.predict(X_test)

In [20]:
# Raw predictions: a NumPy array of class-name strings.
y_pred

array(['fighter', 'transport', 'transport', ..., 'transport', 'fighter',
       'transport'], dtype=object)

In [21]:
# Wrap the predictions in a Series to get pandas helpers such as value_counts().
y_pred_series = pd.Series(y_pred)

In [22]:
# One predicted label per X_test row, indexed 0..n-1.
y_pred_series

0         fighter
1       transport
2       transport
3       transport
4         fighter
          ...    
1495      fighter
1496      fighter
1497    transport
1498      fighter
1499    transport
Length: 1500, dtype: object

In [23]:
# Number of predicted ships per class. Reuses the Series already built from
# y_pred instead of wrapping the same array a second time.
y_pred_series.value_counts()

fighter      675
transport    595
cruiser      230
dtype: int64

In [24]:
# Impurity-based importance of each feature in the fitted forest, in the
# column order of X. NOTE(review): the name is `features_importances`
# (plural "features"), unlike `feature_importances_df` used right after.
features_importances = best_clf.feature_importances_

In [25]:
# Pair each feature name with its importance score for easier inspection.
feature_importances_df = pd.DataFrame({
    'feature': X.columns.tolist(),
    'feature_importance': features_importances,
})

In [26]:
# Rank features from most to least important (returns a sorted copy; the
# frame itself is not reordered). In the output below, time_of_observance
# has an importance of 0.0 in this fit.
feature_importances_df.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature,feature_importance
3,brightness,0.381776
5,volume,0.29606
2,speed,0.126936
1,i_reflection,0.113814
0,g_reflection,0.081413
4,time_of_observance,0.0


In [27]:
# Same preview as the earlier X_test.head() cell; X_test is not modified
# anywhere in between, so this cell is redundant.
X_test.head()

Unnamed: 0,g_reflection,i_reflection,speed,brightness,time_of_observance,volume
0,7.516543,3.916691,513.954279,0.177247,105908,13.267224
1,4.322988,6.967689,63.75297,0.545922,277855,39.83313
2,4.595724,9.098297,62.233948,0.389201,160662,42.014556
3,2.689675,7.964869,62.475495,0.541081,162092,42.056829
4,8.075576,5.169719,336.441261,0.174757,466853,11.779813


In [28]:
# Class-membership probabilities for every X_test row. Columns follow the
# order of best_clf.classes_, not the order of the value_counts() output above.
y_predicted_prob = best_clf.predict_proba(X_test)

In [29]:
# One row per observation, one column per class; each row sums to 1.
y_predicted_prob

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [9.84137731e-04, 0.00000000e+00, 9.99015862e-01],
       [2.26401887e-02, 0.00000000e+00, 9.77359811e-01],
       ...,
       [2.26401887e-02, 0.00000000e+00, 9.77359811e-01],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.26401887e-02, 0.00000000e+00, 9.77359811e-01]])