In [1]:
import pandas as pd
import numpy
import os
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [38]:
import pickle

In [2]:
def process(path_to_folder):
    train = []
    for root, dirs, files in os.walk(path_to_folder):
        for file in files:
            if file.endswith('.txt'):
                with open(os.path.join(root, file), 'r') as f:
                    text = f.read()
                    temp = text.split(';\n')
                    final = []
                    for i in range (len(temp)):
                        a = temp[i].split(',')
                        final.append(a)
                
                    train = train[:] + final
                    
    return train
    

In [3]:
trainphoneaccel = process('train/phone/accel')
trainphonegyro = process('train/phone/gyro')
trainwatchaccel = process('train/watch/accel')
trainwatchgyro = process('train/watch/gyro')

In [4]:
trainphoneaccel[:10]

[['1600', 'A', '252207666810782', '-0.36476135', '8.793503', '1.0550842'],
 ['1600', 'A', '252207717164786', '-0.8797302', '9.768784', '1.0169983'],
 ['1600', 'A', '252207767518790', '2.0014954', '11.10907', '2.619156'],
 ['1600', 'A', '252207817872794', '0.45062256', '12.651642', '0.18455505'],
 ['1600', 'A', '252207868226798', '-2.1643524', '13.928436', '-4.4224854'],
 ['1600', 'A', '252207918580802', '-4.332779', '13.361191', '-0.7188721'],
 ['1600', 'A', '252207968934806', '-0.31944275', '13.318359', '-0.23202515'],
 ['1600', 'A', '252208019288809', '1.566452', '9.515274', '-0.01777649'],
 ['1600', 'A', '252208069642813', '-0.32374573', '5.262665', '0.32234192'],
 ['1600', 'A', '252208119996817', '-1.811676', '3.7105103', '1.3739319']]

In [5]:
train = trainphoneaccel + trainphonegyro + trainwatchaccel + trainwatchgyro

In [6]:
len(train)

5575090

In [7]:
def transform(data):
    data = data[:-1]
    data = pd.DataFrame(data, columns = ['Subject-id', 'Activity Label', 'Timestamp', 'x', 'y', 'z'])
    return data

In [8]:
train = transform(train)

In [9]:
train.shape

(5575089, 6)

In [10]:
train = train.convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [11]:
train.head()

Unnamed: 0,Subject-id,Activity Label,Timestamp,x,y,z
0,1600.0,A,252207700000000.0,-0.364761,8.793503,1.055084
1,1600.0,A,252207700000000.0,-0.87973,9.768784,1.016998
2,1600.0,A,252207800000000.0,2.001495,11.10907,2.619156
3,1600.0,A,252207800000000.0,0.450623,12.651642,0.184555
4,1600.0,A,252207900000000.0,-2.164352,13.928436,-4.422485


In [12]:
train = train.dropna(subset = ['Subject-id','Timestamp', 'Activity Label','x', 'y', 'z'])

In [13]:
label = train['Activity Label'].unique()

In [14]:
l={}
n=0
for i in label:
    l[i] = n+1
    n+=1

train['Activity Label'] = train['Activity Label'].apply(lambda x: l[x])

In [15]:
train.head()

Unnamed: 0,Subject-id,Activity Label,Timestamp,x,y,z
0,1600.0,1,252207700000000.0,-0.364761,8.793503,1.055084
1,1600.0,1,252207700000000.0,-0.87973,9.768784,1.016998
2,1600.0,1,252207800000000.0,2.001495,11.10907,2.619156
3,1600.0,1,252207800000000.0,0.450623,12.651642,0.184555
4,1600.0,1,252207900000000.0,-2.164352,13.928436,-4.422485


In [16]:
testphoneaccel = process('test/phone/accel')
testphonegyro = process('test/phone/gyro')
testwatchaccel = process('test/watch/accel')
testwatchgyro = process('test/watch/gyro')

In [17]:
test = testphoneaccel + testphonegyro + testwatchaccel + testwatchgyro

In [18]:
test = transform(test)

In [19]:
test = test.convert_objects(convert_numeric=True)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [20]:
test = test.dropna(subset = ['Subject-id','Timestamp', 'Activity Label','x', 'y', 'z'])

In [21]:
l={}
n=0
for i in label:
    l[i] = n+1
    n+=1

test['Activity Label'] = test['Activity Label'].apply(lambda x: l[x])

In [22]:
train['Timestamp'] = train['Timestamp'].apply(lambda x: x//1000000)
train['Timestamp'] = train['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))
test['Timestamp'] = test['Timestamp'].apply(lambda x: x//1000000)
test['Timestamp'] = test['Timestamp'].apply(lambda x: datetime.fromtimestamp(x))
train.drop(columns="Subject-id",inplace=True)
test.drop(columns="Subject-id",inplace=True)

In [23]:
for time in ('year','month','week','day','hour','minute','second'):
    train[time] = getattr(train['Timestamp'].dt,time)
train.drop(columns="Timestamp",inplace=True)

for time in ('year','month','week','day','hour','minute','second'):
    test[time] = getattr(test['Timestamp'].dt,time)
test.drop(columns="Timestamp",inplace=True)

In [24]:
train.head()

Unnamed: 0,Activity Label,x,y,z,year,month,week,day,hour,minute,second
0,1,-0.364761,8.793503,1.055084,1977,12,52,29,7,11,6
1,1,-0.87973,9.768784,1.016998,1977,12,52,29,7,11,57
2,1,2.001495,11.10907,2.619156,1977,12,52,29,7,12,47
3,1,0.450623,12.651642,0.184555,1977,12,52,29,7,13,37
4,1,-2.164352,13.928436,-4.422485,1977,12,52,29,7,14,28


In [25]:
train = train.sample(frac=1).reset_index(drop=True)
test = test.sample(frac=1).reset_index(drop=True)
data=pd.DataFrame()
data=pd.concat([train,test])

In [26]:
y=data["Activity Label"]
x=data.drop(columns="Activity Label")
x_train, x_test, y_train, y_test = train_test_split(x,y , train_size = 0.7, random_state =  42)



In [27]:
model = RandomForestClassifier()
model.fit(x_train, y_train)




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
y= model.predict(x_test)
acc = accuracy_score(y_test, y)

In [29]:
acc

0.9698770797889041

In [39]:
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [40]:
df = pd.DataFrame(y)

In [42]:
df.to_csv('answer.csv')