In [78]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [79]:
# reading the file 
df = pd.read_csv(r"06 - Updated.csv")
df.head()

Unnamed: 0,label,sub_mssg
0,0,job post apple research center content length ...
1,0,query letter for text identification I be post...
2,0,risk a colleague and I be the of risk by our h...
3,0,request book information this morning I be on ...
4,0,call for in syntactic theory content length ca...


In [80]:
# printing the shape of the dataframe
df.shape

(4196, 2)

In [81]:
# storing the sub and message clumn in a variable named corpus
corpus = df['sub_mssg'].values.astype('U')

In [82]:
# definning the count vectorizer
vectorizer = CountVectorizer()
# fitting the corpus and transforming it to vector
X = vectorizer.fit_transform(corpus)

In [83]:
# Corpus after count vectorizer
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [84]:
# getting the shape of the count vectorizer
l = X.toarray()
l.shape

(4196, 21740)

In [85]:
# now definning the Tf Idf vectorizer
vectorizer = TfidfVectorizer(max_features=2000)
X = vectorizer.fit_transform(corpus)

In [86]:
type(X)

scipy.sparse.csr.csr_matrix

In [87]:
# getting the feature names
vectorizer.get_feature_names()[144]



'assistant'

In [88]:
X = X.toarray()

In [89]:
type(X)

numpy.ndarray

In [90]:
# getting the shape of the output 
X.shape

(4196, 2000)

In [91]:
X.dtype

dtype('float64')

In [93]:
temp = df['label'].array
temp

<PandasArray>
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 ...
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Length: 4196, dtype: int64

In [94]:
temp = np.array(temp)
temp

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [95]:
temp.shape

(4196,)

In [96]:
temp = np.reshape(temp,(4196,1))

In [97]:
temp.shape

(4196, 1)

In [98]:
temp

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int64)

In [99]:
temp = np.hstack([X,temp])

In [100]:
temp.shape

(4196, 2001)

In [101]:
temp = temp[:-1]
temp.shape

(4195, 2001)

In [102]:
temp = np.array(temp,dtype=float)
temp

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.10089788, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.09047175, 0.        , 0.04417766, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [103]:
temp = temp.round(3)

In [104]:
temp = temp.transpose()
temp.shape

(2001, 4195)

In [76]:
# now coverting the output of tf idf to a csv file
np.savetxt('train.csv',temp,delimiter=',')

In [77]:
df = pd.read_csv(r"TF IDF.csv")
df.head()

Unnamed: 0,0.000000000000000000e+00,0.000000000000000000e+00.1,0.000000000000000000e+00.2,0.000000000000000000e+00.3,0.000000000000000000e+00.4,0.000000000000000000e+00.5,0.000000000000000000e+00.6,0.000000000000000000e+00.7,0.000000000000000000e+00.8,0.000000000000000000e+00.9,...,0.000000000000000000e+00.1883,0.000000000000000000e+00.1884,0.000000000000000000e+00.1885,0.000000000000000000e+00.1886,0.000000000000000000e+00.1887,0.000000000000000000e+00.1888,0.000000000000000000e+00.1889,0.000000000000000000e+00.1890,0.000000000000000000e+00.1891,0.000000000000000000e+00.1892
0,0.0,0.0,0.0,0.066,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.038,0.0,0.032,0.0,0.034,0.0,0.0,0.0,0.066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.shape

In [None]:
df.iloc[-1].unique()

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = \
    train_test_split(X, df['label'], test_size=0.3, random_state=5)

In [None]:
x_train.shape

In [None]:
x_test.shape

## Naive Bayes classifier

In [None]:
#converting sparse matrix to matrix
x_train = x_train.A
x_test = x_test.A

In [None]:
model = MultinomialNB().fit(x_train,y_train)

In [None]:
y_pred = model.predict(x_test)
y_pred

In [None]:
accuracy_score(y_test,y_pred)