### Classification using scikit-learn (with pandas)

In [None]:
# Fetch Cities.csv with wget through IPython's `!` shell escape.
# The data-loading cell below opens 'Cities.csv' by relative path, so the
# download has to land in the notebook's working directory.
!wget https://transfer.sh/Mos5q/Cities.csv

In [3]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [6]:
# Read Cities.csv into a dataframe and add a column for temperature category.
# Each city is labelled from its 'temperature' value:
#   < 5 -> 'cold',  < 9 -> 'cool',  < 15 -> 'warm',  anything else -> 'hot'
def temperature_category(temperature):
    """Return 'cold', 'cool', 'warm' or 'hot' for a single temperature value."""
    if temperature < 5:
        return 'cold'
    if temperature < 9:
        return 'cool'
    if temperature < 15:
        return 'warm'
    # Fallback branch: also reached for missing (NaN) temperatures, because
    # every comparison against NaN is False.
    return 'hot'

# Plain text mode already performs universal-newline translation in Python 3;
# the old 'rU' mode is rejected by Python 3.11+. The with-block also makes
# sure the file handle is closed once pandas has parsed it.
with open('Cities.csv') as f:
    cities = pd.read_csv(f)
# Categorize straight from the column values instead of looking rows up one
# at a time through the .ix indexer, which pandas removed in version 1.0.
cats = [temperature_category(temperature) for temperature in cities['temperature']]
cities['category'] = cats
# Report how many cities ended up in each category.
print("cold:", len(cities[(cities.category == 'cold')]))
print("cool:", len(cities[(cities.category == 'cool')]))
print("warm:", len(cities[(cities.category == 'warm')]))
print("hot:", len(cities[(cities.category == 'hot')]))

  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()


cold: 17
cool: 92
warm: 79
hot: 25


In [7]:
# Split the cities data: the leading 85% of rows become the training set,
# the remaining rows become the test set (row order is left as loaded).
percenttrain = 0.85
numitems = cities.shape[0]
numtrain = int(percenttrain * numitems)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
# Positional slices: rows [0, numtrain) for training, [numtrain, end) for testing.
citiesTrain = cities.iloc[:numtrain]
citiesTest = cities.iloc[numtrain:]

Training set 181 items
Test set 32 items


### K-nearest-neighbors classification

In [8]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
neighbors = 8
classifier = KNeighborsClassifier(neighbors)
classifier.fit(citiesTrain[features], citiesTrain['category'])
predictions = classifier.predict(citiesTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = citiesTest['category'].values
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))
# Comment out print, try other values for neighbors, other features

Predicted: warm  Actual: cool
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cold
Predicted: cold  Actual: cold
Predicted: warm  Actual: warm
Predicted: cool  Actual: cold
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: warm  Actual: warm
Predicted: hot  Actual: hot
Predicted: cold  Actual: cold
Predicted: cool  Actual: cold
Predicted: cool  Actual: cold
Predicted: hot  Actual: hot
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: cool
Predicted: cool  Actual: warm
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Predicted: warm  Actual: warm
Predicted: cool  Actual: cool
Accuracy: 0.78125


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [None]:
# Fetch Players.csv with wget through IPython's `!` shell escape.
# The set-up cell below opens 'Players.csv' by relative path, so the
# download has to land in the notebook's working directory.
!wget https://transfer.sh/CfbJS/Players.csv

In [9]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# This cell does all the set-up, including reordering the data to avoid team bias.
# Plain text mode already performs universal-newline translation in Python 3;
# the old 'rU' mode is rejected by Python 3.11+. The with-block also makes
# sure the file handle is closed once pandas has parsed it.
with open('Players.csv') as f:
    players = pd.read_csv(f)
# Reorder by surname, then renumber the rows 0..n-1 so that the positional
# split below is independent of the order in the file.
players = players.sort_values(by='surname')
players = players.reset_index(drop=True)
# The first 95% of the reordered rows train the model; the rest are held out.
numitems = len(players)
percenttrain = 0.95
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
playersTrain = players[0:numtrain]
playersTest = players[numtrain:]

Training set 565 items
Test set 30 items


  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
neighbors = 10
classifier = KNeighborsClassifier(neighbors)
classifier.fit(playersTrain[features], playersTrain['position'])
predictions = classifier.predict(playersTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = playersTest['position'].values
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))

Accuracy: 0.5666666666666667


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


### Decision tree classification

In [11]:
# Predict temperature category from other features
features = ['longitude','latitude']
split = 10
dt = DecisionTreeClassifier(min_samples_split=split) # parameter is optional
dt.fit(citiesTrain[features],citiesTrain['category'])
predictions = dt.predict(citiesTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = citiesTest['category'].values
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))
# Try other values for split, other features

Accuracy: 0.6875


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


### "Forest" of decision trees

In [12]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
trees = 10
# No random_state is passed, so the reported accuracy can differ between runs.
rf = RandomForestClassifier(n_estimators=trees)
rf.fit(citiesTrain[features],citiesTrain['category'])
predictions = rf.predict(citiesTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = citiesTest['category'].values
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))
# Try other values for trees

Accuracy: 0.71875


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  del sys.path[0]


### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [13]:
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for min_samples_split.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split = 10
dt = DecisionTreeClassifier(min_samples_split=split) # parameter is optional
dt.fit(playersTrain[features],playersTrain['position'])
predictions = dt.predict(playersTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = playersTest['position'].values
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))

Accuracy: 0.4666666666666667


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


In [14]:
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different values for n_estimators.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
trees = 10
# No random_state is passed, so the reported accuracy can differ between runs.
rf = RandomForestClassifier(n_estimators=trees)
rf.fit(playersTrain[features],playersTrain['position'])
predictions = rf.predict(playersTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = playersTest['position'].values
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))

Accuracy: 0.5666666666666667


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


### Naive Bayes classification

In [15]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
nb = GaussianNB()
nb.fit(citiesTrain[features],citiesTrain['category'])
predictions = nb.predict(citiesTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = citiesTest['category'].values
numtest = len(citiesTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))
# Try other features

Accuracy: 0.78125


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [None]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features. What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
nb = GaussianNB()
nb.fit(playersTrain[features],playersTrain['position'])
predictions = nb.predict(playersTest[features])
# Calculate accuracy
# Take the true labels out of the test set as a plain array so that entry i
# lines up with predictions[i] no matter how the test rows are indexed.
# (The previous .ix[numtrain+i] lookup no longer exists in pandas >= 1.0.)
actual = playersTest['position'].values
numtest = len(playersTest)
correct = 0
for i in range(numtest):
#    print('Predicted:', predictions[i], ' Actual:', actual[i])
    if predictions[i] == actual[i]: correct += 1
print('Accuracy:', float(correct)/float(numtest))