# The Basic Prediction example cheated by mixing the past with the future.  In this example we avoid that by predicting the winners of the district matches using only the regionals.

Load the two training events (Auburn and Glacier) and the held-out test event into pandas data frames.

In [34]:
import pandas as pd

# All event files are read the same way: tab-separated, no header row, and
# three columns named here as Label, RedAlliance and BlueAlliance.
# NOTE(review): Label presumably marks a red-alliance win (1) vs. loss (0) — confirm.
event_file_format = dict(sep='\t', header=None, names=['Label','RedAlliance','BlueAlliance'])

# Training events.
train_auburn = pd.read_csv('../event_features_2019waahs.txt', **event_file_format)
train_glacier = pd.read_csv('../event_features_2019wasno.txt', **event_file_format)

# Held-out event used only for evaluation.
test_districts = pd.read_csv('../event_features_2019pncmp.txt', **event_file_format)


Preview the first 10 rows of the Auburn training data.

In [35]:
# Show the first ten rows of the Auburn training frame.
train_auburn.head(10)

Unnamed: 0,Label,RedAlliance,BlueAlliance
0,1,frc2990 frc2046 frc4579,frc4911 frc4089 frc6503
1,1,frc2990 frc2046 frc4579,frc4911 frc4089 frc2929
2,1,frc2990 frc2046 frc4579,frc3049 frc5937 frc3219
3,1,frc2990 frc2046 frc4579,frc3049 frc5937 frc3219
4,1,frc2907 frc1318 frc2926,frc360 frc3574 frc3876
5,0,frc2907 frc1318 frc2926,frc360 frc3574 frc3876
6,0,frc2907 frc1318 frc2926,frc360 frc3574 frc3876
7,0,frc492 frc4918 frc7118,frc948 frc4131 frc3070
8,0,frc492 frc4918 frc7118,frc948 frc4131 frc3070
9,1,frc4911 frc4089 frc6503,frc2097 frc6350 frc2927


We can leverage some concepts from https://stackabuse.com/text-classification-with-python-and-scikit-learn/ to build our model.  The basic idea is to use the team names as features.  Suppose frc492 is a really strong team: when it appears in the RedAlliance column it will add some weight to the probability that Red wins, and vice versa if it appears in the BlueAlliance column.  So we want to build a predictor that figures out how much it matters when frc492 (or any other team) appears in a given column.

In [36]:
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

# Two independent count vectorizers, one per alliance column, so that a team
# is encoded by a separate feature when it plays on red and when it plays on blue.
vectorizer_options = dict(max_features=1500, min_df=1, max_df=1.0, stop_words=None)
redVectorizer = CountVectorizer(**vectorizer_options)
blueVectorizer = CountVectorizer(**vectorizer_options)

ct = ColumnTransformer(
    transformers=[
        ('RedFeatures', redVectorizer, 'RedAlliance'),
        ('BlueFeatures', blueVectorizer, 'BlueAlliance'),
    ]
)

# Stack both training events into a single frame. The rows stay in file
# order: no shuffling is applied in this cell.
train = pd.concat([train_auburn, train_glacier])

# Fit the encoders on the training events only, then reuse the fitted
# encoders to transform the held-out event.
X = ct.fit_transform(train)
y = train['Label']

Xtest = ct.transform(test_districts)
Ytest = test_districts['Label']

# Inspect the encoded form of the first training row.
X[0]

<1x134 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

We have the data in a state where we can start to build models. First we'll try a basic random forest with 100 trees.

In [37]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees and a fixed seed; a node must hold at least
# 3 samples before it may be split.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    min_samples_split=3,
)
classifier

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [38]:
import numpy as np
from sklearn.metrics import accuracy_score

# Fit on the training events, then measure accuracy on the held-out event.
# This is a single train/test evaluation; no cross-validation is run here.
predictions = classifier.fit(X, y).predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=predictions)

0.6223776223776224

## Let's also try logistic regression.

In [40]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Logistic regression with the lbfgs solver and a fixed seed.
# NOTE(review): newer scikit-learn releases appear to deprecate `multi_class`;
# confirm against the installed version before upgrading.
classifier = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

# Fit on the training events, then measure accuracy on the held-out event.
classifier.fit(X, y)
predictions = classifier.predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=predictions)

0.6573426573426573

## Linear Regression (Ridge Classifier)

In [41]:
import numpy as np
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score

# Ridge classifier with its default settings.
classifier = RidgeClassifier()

# Fit on the training events, then measure accuracy on the held-out event.
predictions = classifier.fit(X, y).predict(Xtest)
accuracy_score(y_true=Ytest, y_pred=predictions)

0.6363636363636364