In [2]:

import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [3]:
message_data = pd.read_csv("Content.csv",encoding = "latin")
message_data.head()

Unnamed: 0.1,Unnamed: 0,Content,Pages,Unit
0,8,"['1', 'introduct', 'human', 'be', 'curiou', 't...",1,1
1,9,"['210th', 'standard', 'scienc', 'mechan', 'bra...",2,1
2,10,"['3', 'law', 'motion', '1', '4', 'newton', 's'...",3,1
3,11,"['410th', 'standard', 'scienc', 'if', 'result'...",4,1
4,12,"['5', 'law', 'motion', 'the', 'rod', 'turn', '...",5,1


In [4]:
message_data = message_data.drop(['Unnamed: 0'], axis = 1)

In [5]:
message_data = message_data.rename(columns = {'Content':'content','Unit':'unit'})

In [6]:
message_data.groupby('unit').describe()

Unnamed: 0_level_0,Pages,Pages,Pages,Pages,Pages,Pages,Pages,Pages
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
unit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,15.0,8.0,4.472136,1.0,4.5,8.0,11.5,15.0
2,16.0,23.5,4.760952,16.0,19.75,23.5,27.25,31.0
3,10.0,36.5,3.02765,32.0,34.25,36.5,38.75,41.0
4,17.0,50.0,5.049752,42.0,46.0,50.0,54.0,58.0
5,15.0,66.0,4.472136,59.0,62.5,66.0,69.5,73.0
6,17.0,82.0,5.049752,74.0,78.0,82.0,86.0,90.0
7,15.0,98.0,4.472136,91.0,94.5,98.0,101.5,105.0
8,18.0,114.5,5.338539,106.0,110.25,114.5,118.75,123.0
9,13.0,130.0,3.89444,124.0,127.0,130.0,133.0,136.0
10,18.0,145.5,5.338539,137.0,141.25,145.5,149.75,154.0


In [7]:
message_data_copy = message_data['content'].copy()

In [8]:

def text_preprocess(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    return " ".join(text)

In [9]:
message_data_copy = message_data_copy.apply(text_preprocess)

In [10]:
message_data_copy

0      1 introduct human curiou thing around thing ar...
1      210th standard scienc mechan branch physic dea...
2      3 law motion 1 4 newton law motion 1 4 1 newto...
3      410th standard scienc result forc forc act bod...
4      5 law motion rod turn fix point call point rot...
5      610th standard scienc equilibrium algebra sum ...
6      7 law motion unit forc amount forc requir prod...
7      810th standard scienc fill fuel either liquid ...
8      9 law motion let m1 m2 mass two bodi b place r...
9      1010th standard scienc 1 10 5 variat acceler d...
10     11 law motion tabl 1 2 appar weight person mov...
11     1210th standard scienc 1 12 3 applic newton la...
12     13 law motion choos correct answer 1 inertia b...
13     1410th standard scienc iv match follow column ...
14     15 law motion ix hot question 1 two block mass...
15     1610th standard scienc introduct light form en...
16     17 optic 6 differ colour light differ waveleng...
17     1810th standard scienc 2

In [11]:
vectorizer = TfidfVectorizer("english")

In [12]:
message_mat = vectorizer.fit_transform(message_data_copy)
message_mat

<333x7462 sparse matrix of type '<class 'numpy.float64'>'
	with 45110 stored elements in Compressed Sparse Row format>

In [24]:

message_train, message_test, unit_train, unit_test = train_test_split(message_mat, 
                                                        message_data['unit'], test_size=0.2, random_state=20)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

unit_model = LogisticRegression(solver='liblinear', penalty='l1')
unit_model.fit(message_train, unit_train)
pred = unit_model.predict(message_test)
accuracy_score(unit_test,pred)

0.8059701492537313

In [26]:

def stemmer (text):
    text = text.split()
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    return words

In [27]:

message_data_copy = message_data_copy.apply(stemmer)
vectorizer = TfidfVectorizer("english")
message_mat = vectorizer.fit_transform(message_data_copy)

In [30]:

message_train, message_test, unit_train, unit_test = train_test_split(message_mat, 
                                                        message_data['unit'], test_size=0.2, random_state=20)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

unit_model = LogisticRegression(solver='liblinear', penalty='l1')
unit_model.fit(message_train, unit_train)
pred = unit_model.predict(message_test)
accuracy_score(unit_test,pred)

0.8059701492537313

In [32]:
message_data['length'] = message_data['content'].apply(len)
message_data.head()

Unnamed: 0,content,Pages,unit,length
0,"['1', 'introduct', 'human', 'be', 'curiou', 't...",1,1,2012
1,"['210th', 'standard', 'scienc', 'mechan', 'bra...",2,1,2927
2,"['3', 'law', 'motion', '1', '4', 'newton', 's'...",3,1,2686
3,"['410th', 'standard', 'scienc', 'if', 'result'...",4,1,2024
4,"['5', 'law', 'motion', 'the', 'rod', 'turn', '...",5,1,2755


In [33]:

length = message_data['length'].as_matrix()
new_mat = np.hstack((message_mat.todense(),length[:, None]))

  """Entry point for launching an IPython kernel.


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

unit_model = LogisticRegression(solver='liblinear', penalty='l1')
unit_model.fit(message_train, unit_train)
pred = unit_model.predict(message_test)
accuracy_score(unit_test,pred)

0.8059701492537313