Robots are smart… by design. To fully understand and properly navigate a task, however, they need input about their environment.

Help robots recognize the floor surface they’re standing on using data collected from Inertial Measurement Units (IMU sensors).

In [None]:
## importing libraries required
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, train_test_split, KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go

In [None]:
# Switch the working directory to the data folder so that later cells can
# open the CSV files by bare file name.
path = "../input"
os.chdir(path)
print(path)

In [None]:
# Load, in order: training features, test features, training labels and the
# submission template. Files are resolved against the current working directory.
X_train_robo, X_test_robo, y_train_robo, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'sample_submission.csv')
)

## Viewing the Train and Test data

In [None]:
# Preview the first rows of the raw training measurements (one row per measurement).
X_train_robo.head()

In [None]:
# Preview the first rows of the training labels.
y_train_robo.head()

### y_train.csv - the surfaces for training set.

- series_id: ID number for the measurement series.

- group_id: ID number for all of the measurements taken in a recording session. Provided for the training set only, to enable more cross validation strategies.

- surface: the target for this competition.

In [None]:
# Preview the first rows of the raw test measurements.
X_test_robo.head()

### X_[train/test].csv - the input data, covering 10 sensor channels and 128 measurements per time series plus three ID columns:

- row_id: The ID for this row.

- series_id: ID number for the measurement series. Foreign key to y_train/sample_submission.

- measurement_number: Measurement number within the series.
- sensor channels: 
        orientation_X
        orientation_Y
        orientation_Z
        orientation_W
        angular_velocity_X
        angular_velocity_Y
        angular_velocity_Z
        linear_acceleration_X
        linear_acceleration_Y
        linear_acceleration_Z

## Size of Dataset

In [None]:
# Report (rows, columns) for each of the three raw frames.
for heading, frame in (
    ("X_Train : ", X_train_robo),
    ("X_Test : ", X_test_robo),
    ("y_Train : ", y_train_robo),
):
    print(heading, frame.shape)

### We can observe that the predictors and the target have different numbers of rows (X holds one row per measurement, y holds one row per series).

In [None]:
# Number of distinct measurement series in each split.
train_series_total = X_train_robo["series_id"].nunique()
test_series_total = X_test_robo["series_id"].nunique()
print("Train series count : %d" % train_series_total)
print("Test series count : %d" % test_series_total)

In [None]:
# Number of distinct measurement_number values in each split.
train_measurement_total = X_train_robo["measurement_number"].nunique()
test_measurement_total = X_test_robo["measurement_number"].nunique()
print("Train measurement_number count : %d" % train_measurement_total)
print("Test measurement_number count : %d" % test_measurement_total)

### We can see that the Test data has 6 more series than the Train data
### The Train set has 3810 measurement series, each with 128 measurements

## check for missing, NA values

In [None]:
# Per-column count of missing values for each frame, separated by a divider line.
na_reports = (
    ("X_Train : \n", X_train_robo),
    ("X_Test : \n", X_test_robo),
    ("y_Train : \n", y_train_robo),
)
for position, (heading, frame) in enumerate(na_reports):
    if position:
        print("---------------------------")
    print(heading, frame.isnull().sum())

### no NA values in Train and Test data

In [None]:
# Summary statistics of the numeric columns in the raw training measurements.
X_train_robo.describe()

In [None]:
# Summary statistics of the numeric columns in the raw test measurements.
X_test_robo.describe()

# group the rows according to series id

In [None]:
# Collapse every series to a single row holding the mean of each numeric column.
# numeric_only=True skips non-numeric columns explicitly (row_id is presumably
# a string -- TODO confirm); without it, pandas >= 2.0 raises a TypeError
# instead of silently dropping such columns as older versions did.
X_train_grp = X_train_robo.groupby(['series_id'], as_index=False).mean(numeric_only=True)
print(X_train_grp.shape)
X_train_grp.head()

In [None]:
## Test data: collapse every series to a single row of per-column means.
# numeric_only=True skips non-numeric columns explicitly (row_id is presumably
# a string -- TODO confirm); without it, pandas >= 2.0 raises a TypeError
# instead of silently dropping such columns as older versions did.
test = X_test_robo.groupby(['series_id'], as_index=False).mean(numeric_only=True)
print(test.shape)
test.head()

In [None]:
# Attach the labels to the per-series features by joining on series_id
# (inner join, the pandas default).
train = X_train_grp.merge(y_train_robo, on='series_id')
train.shape

### dropping measurement_number after merging columns

In [None]:
# After averaging per series, measurement_number is only the mean of the
# within-series counter, not a sensor reading, so it is removed from both frames.
# Re-assigning (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
train = train.drop(columns=["measurement_number"], errors="ignore")
test = test.drop(columns=["measurement_number"], errors="ignore")

In [None]:
# Shapes of the per-series frames after the column drop.
for tag, frame in (("train:", train), ("test:", test)):
    print(tag, frame.shape)

### check data types of features

In [None]:
# Column data types of the per-series training frame.
train.dtypes

In [None]:
# Column data types of the per-series test frame.
test.dtypes

In [None]:
# Bar chart of how many training series belong to each surface class.
surface_counts = train["surface"].value_counts()
ax = surface_counts.plot.bar(figsize=(10, 6), color="red", alpha=0.7, fontsize=13)
ax.set_title('Surface (in numbers)')
ax.set_xlabel('Surface')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of every numeric column in the per-series training frame.
# DataFrame.hist creates its own figure, so no plt.figure() call is needed
# (the previous one only produced an extra empty figure). plt.show() renders
# the plots and keeps the array-of-axes repr out of the cell output.
train.hist(bins=50, figsize=(20, 15))
plt.show()

## split data
### Divide into train and validation


In [None]:
# Target: the surface label. Features: every remaining column except group_id,
# which is provided for the training set only and so cannot be used at test time.
# NOTE(review): series_id is still part of X and is therefore used as a feature;
# it is an identifier, so consider whether it should be excluded (the test frame
# would then need the same treatment).
y = train["surface"]
X = train.drop(columns=["surface", "group_id"])

# Fixed seed -> the same split on every run; stratify=y keeps the class
# proportions equal in both parts.
# NOTE(review): series from the same recording session (group_id) can land on
# both sides of a random split, which may make the validation score optimistic.
SPLIT_SEED = 7
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.20, random_state=SPLIT_SEED, stratify=y
)
print("X_train", X_train.shape)
print("X_validation", X_validation.shape)
print("y_train", y_train.shape)
print("y_validation", y_validation.shape)
y.unique()

### there are 9 levels in the target 

## Build a model

In [None]:
## Random Forest model: estimate accuracy with k-fold cross-validation

seed = 7         # shared seed so both the folds and the forest are reproducible
num_folds = 10   # number of cross-validation folds

# Params for Random Forest
num_trees = 100
max_features = 3
models = []
models.append(('RF', RandomForestClassifier(n_estimators=num_trees,
                                            max_features=max_features,
                                            random_state=seed)))

# random_state only has an effect when shuffle=True; recent scikit-learn versions
# raise a ValueError if random_state is set while shuffle is left at False.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Evaluate each model in turn and print mean accuracy (standard deviation)
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


In [None]:
# Disabled draft: the same fit / predict / report steps as the `random_forest1`
# cell that follows, under a different variable name. Nothing in this cell runs.
# random_forest = RandomForestClassifier(n_estimators=50,max_features=5)
# random_forest.fit(X_train, y_train)
# predictions_rf = random_forest.predict(X_validation)
# print("Accuracy: %s%%" % (100*accuracy_score(y_validation, predictions_rf)))
# print('----------------')
# print(confusion_matrix(y_validation, predictions_rf))
# print('----------------')
# print(classification_report(y_validation, predictions_rf))

## predictions on test data

In [None]:
# Fit the forest on the training split and score it on the held-out validation
# split. random_state pins the tree construction so the reported accuracy,
# confusion matrix and per-class report are the same on every run.
random_forest1 = RandomForestClassifier(n_estimators=50, max_features=5, random_state=7)
random_forest1.fit(X_train, y_train)
predictions_rf = random_forest1.predict(X_validation)
print("Accuracy: %s%%" % (100*accuracy_score(y_validation, predictions_rf)))
print('----------------')
# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_validation, predictions_rf))
print('----------------')
print(classification_report(y_validation, predictions_rf))

In [None]:
# Prediction on test data
# Feed the model exactly the columns it was fitted on, in the same order.
test_features = test[X_train.columns]
predictions = random_forest1.predict(test_features)
print(len(predictions))

# Attach each prediction to its series_id instead of relying on the rows of
# `sample` and `test` happening to be in the same order.
surface_by_series = pd.Series(predictions, index=test["series_id"].to_numpy())
sample['surface'] = sample["series_id"].map(surface_by_series)
assert sample['surface'].notna().all(), "a series_id in the submission file has no prediction"
#sample.to_csv('samplesubmission.csv', index=False)
sample.head()