In [1]:
import pandas as pd 

data = pd.read_csv('Data/Shakespeare_data.csv')
data

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
...,...,...,...,...,...,...
111391,111392,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely"
111392,111393,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part
111393,111394,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first
111394,111395,A Winters Tale,38.0,5.3.183,LEONTES,We were dissever'd: hastily lead away.


First I changed all 'Play' and 'Player' from name to numbers, so that each play and player will have different number

In [2]:
# change play and player from string to numbers
data['Play'] = data['Play'].astype('category') # check how many different play name. 
data['Play']

0               Henry IV
1               Henry IV
2               Henry IV
3               Henry IV
4               Henry IV
               ...      
111391    A Winters Tale
111392    A Winters Tale
111393    A Winters Tale
111394    A Winters Tale
111395    A Winters Tale
Name: Play, Length: 111396, dtype: category
Categories (36, object): [A Comedy of Errors, A Midsummer nights dream, A Winters Tale, Alls well that ends well, ..., Troilus and Cressida, Twelfth Night, Two Gentlemen of Verona, macbeth]

In [3]:
data['Play'] = data['Play'].cat.rename_categories(list(range(1,37))).astype('int')

In [4]:
data['Player'] = data['Player'].astype('category') # check how many different player name
data['Player']

0                   NaN
1                   NaN
2                   NaN
3         KING HENRY IV
4         KING HENRY IV
              ...      
111391          LEONTES
111392          LEONTES
111393          LEONTES
111394          LEONTES
111395          LEONTES
Name: Player, Length: 111396, dtype: category
Categories (934, object): [A Lord, A Patrician, A Player, AARON, ..., of BUCKINGHAM, of King Henry VI, of Prince Edward, of young Princes]

In [5]:
data['Player'] = data['Player'].cat.rename_categories(list(range(1,935))).astype('int')
data = data.dropna() # drop all Nan value
data

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,4,10,1.0,1.1.1,458,"So shaken as we are, so wan with care,"
4,5,10,1.0,1.1.2,458,"Find we a time for frighted peace to pant,"
5,6,10,1.0,1.1.3,458,And breathe short-winded accents of new broils
6,7,10,1.0,1.1.4,458,To be commenced in strands afar remote.
7,8,10,1.0,1.1.5,458,No more the thirsty entrance of this soil
...,...,...,...,...,...,...
111390,111391,3,38.0,5.3.179,495,"Is troth-plight to your daughter. Good Paulina,"
111391,111392,3,38.0,5.3.180,495,"Lead us from hence, where we may leisurely"
111392,111393,3,38.0,5.3.181,495,Each one demand an answer to his part
111393,111394,3,38.0,5.3.182,495,Perform'd in this wide gap of time since first


Since the feature ActSceneLine is in string and it can be used for the classification, so to make it easier to use, I split it into three columns: Act, Scene, and Line, these three columns are in int type. 

In [6]:
splitASL = data['ActSceneLine'].astype('str').str.split(pat='.',expand=True)

In [7]:
splitASL = splitASL.rename(columns={0:'Act',1:'Scene',2:'Line'})
splitASL

Unnamed: 0,Act,Scene,Line
3,1,1,1
4,1,1,2
5,1,1,3
6,1,1,4
7,1,1,5
...,...,...,...
111390,5,3,179
111391,5,3,180
111392,5,3,181
111393,5,3,182


In [8]:
data = pd.concat([data, splitASL], axis=1, sort=False)

Then I removed some unused features. I was tried to explore the column "PlayerLine" to get more features for classification, then I realized that it will make much more work to count the words, or some other steps, so I decide to not use this column.  

In [9]:
# remove some unused features. 
data = data.drop(['Dataline', 'ActSceneLine', 'PlayerLine'], axis=1)
data.to_csv('Data/new_datasets.csv', index=False)
data

Unnamed: 0,Play,PlayerLinenumber,Player,Act,Scene,Line
3,10,1.0,458,1,1,1
4,10,1.0,458,1,1,2
5,10,1.0,458,1,1,3
6,10,1.0,458,1,1,4
7,10,1.0,458,1,1,5
...,...,...,...,...,...,...
111390,3,38.0,495,5,3,179
111391,3,38.0,495,5,3,180
111392,3,38.0,495,5,3,181
111393,3,38.0,495,5,3,182


I then split the data into training data and testing data with seperate sample and labels, where label is player, and sample is all other features. 

In [10]:
from sklearn.model_selection import train_test_split
label = data['Player']
sample = data.drop('Player',axis=1)
sample_train, sample_test, label_train, label_test = train_test_split(sample, label, test_size=0.1)

I first tried with Gaussian Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

model_1 = GaussianNB()
model_1.fit(sample_train,label_train)
label_predict_1 = model_1.predict(sample_test)
accuracy_1 = model_1.score(sample_test,label_test)
accuracy_1

0.2222327881323697

As the result, the Gaussian Naive Bayes runs very fast but it gives a very bad prediction with low accuracy, so then I tried with Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

model_2 = RandomForestClassifier(n_estimators=10)
model_2.fit(sample_train,label_train)
label_predict_2 = model_2.predict(sample_test)
accuracy_2 = model_2.score(sample_test,label_test)
accuracy_2

0.8132369722327881

As the result shows that this classifier is as good as 80% accurcy, so it is much better than Naive Bayes,
I also tried with Decision Tree. 

In [13]:
from sklearn.tree import DecisionTreeClassifier

model_3 = DecisionTreeClassifier()
model_3.fit(sample_train,label_train)
label_predict_3 = model_3.predict(sample_test)
accuracy_3 = model_3.score(sample_test,label_test)
accuracy_3

0.7937428680106504

The result of Decision Tree is also close to 80% accuracy, but not as good as Random Forest, I then tried to change some features and datasets, to see if I can get better result by using Random Forest

My previous test are using 10% test sample and 90% training sample, that may effect the prediction result since testing sample is small, so I tried with 20% testing sample

In [14]:
sample_train_2, sample_test_2, label_train_2, label_test_2 = train_test_split(sample, label, test_size=0.2)

In [15]:
model_4 = RandomForestClassifier(n_estimators=10)
model_4.fit(sample_train_2,label_train_2)
label_predict_4 = model_4.predict(sample_test_2)
accuracy_4 = model_4.score(sample_test_2,label_test_2)
accuracy_4

0.80029480290999

The result with more testing data is decreased little bit from 10% testing data, then I tried to make the randome forest model with more randome trees. 

In [16]:
model_5 = RandomForestClassifier(n_estimators=20)
model_5.fit(sample_train,label_train)
label_predict_5 = model_5.predict(sample_test)
accuracy_5 = model_5.score(sample_test,label_test)
accuracy_5

0.8252187143400532

The model with more randome trees is taking longer time to prediction, but it comes with better accuracy. So that means if the model is more complex, with large training datasets, we may have better results. 

I then want see which features will effect the prediction result, so I trained some datasets with less features.

In [17]:
sample_less1 = data.drop(['Player','PlayerLinenumber'],axis=1)
sample_train_less1, sample_test_less1, label_train_less1, label_test_less1 = train_test_split(sample_less1, label, test_size=0.1)
model_6 = RandomForestClassifier(n_estimators=10)
model_6.fit(sample_train_less1,label_train_less1)
label_predict_6 = model_6.predict(sample_test_less1)
accuracy_6 = model_6.score(sample_test_less1,label_test_less1)
accuracy_6

0.7170026626093572

In [18]:
sample_less2 = data.drop(['Player','Act'],axis=1)
sample_train_less2, sample_test_less2, label_train_less2, label_test_less2 = train_test_split(sample_less2, label, test_size=0.1)
model_7 = RandomForestClassifier(n_estimators=10)
model_7.fit(sample_train_less2,label_train_less2)
label_predict_7 = model_7.predict(sample_test_less2)
accuracy_7 = model_7.score(sample_test_less2,label_test_less2)
accuracy_7

0.7339292506656523

In [19]:
sample_less3 = data.drop(['Player','Scene'],axis=1)
sample_train_less3, sample_test_less3, label_train_less3, label_test_less3 = train_test_split(sample_less3, label, test_size=0.1)
model_8 = RandomForestClassifier(n_estimators=10)
model_8.fit(sample_train_less3,label_train_less3)
label_predict_8 = model_8.predict(sample_test_less3)
accuracy_8 = model_8.score(sample_test_less3,label_test_less3)
accuracy_8

0.7303157093952073

In [20]:
sample_less4 = data.drop(['Player','Line'],axis=1)
sample_train_less4, sample_test_less4, label_train_less4, label_test_less4 = train_test_split(sample_less4, label, test_size=0.1)
model_9 = RandomForestClassifier(n_estimators=10)
model_9.fit(sample_train_less4,label_train_less4)
label_predict_9 = model_9.predict(sample_test_less4)
accuracy_9 = model_9.score(sample_test_less4,label_test_less4)
accuracy_9

0.8436667934575884

The result is very interesting that the model will have higher prediction accuracy without the feature 'Line', so that the Line feature is the fact that can effect the model prediction. 