In [3]:
# https://realpython.com/python-keras-text-classification/ 

import pandas as pd

# Dataset name -> path of its labelled file. Only the student feedback set is
# enabled; the other entries are kept commented out for reference.
# NOTE(review): the old `#\t` remark suggests the commented-out .txt files are
# tab-separated -- confirm before re-enabling them with sep=','.
filepath_dict = {
    # 'yelp':   'data/yelp_labelled.txt',
    # 'amazon': 'data/amazon_cells_labelled.txt',
    # 'imdb':   'data/imdb_labelled.txt',
    'student': 'data/student.csv',
}

# Read each enabled file into its own frame and tag every row with the
# dataset name it came from, so the combined frame can be filtered by source.
df_list = []
for source, filepath in filepath_dict.items():
    df_list.append(
        pd.read_csv(filepath, names=['sentence', 'label'], sep=',')
          .assign(source=source)
    )

# Stack the per-dataset frames into a single frame and show its first row.
df = pd.concat(df_list)
print(df.iloc[0])

sentence    This is one of the beginner and best course in...
label                                                       1
source                                                student
Name: 0, dtype: object


In [4]:
# sentences = ['John likes ice cream', 'John hates chocolate.']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the corpus inside this cell: nothing above defines `sentences`, so on a
# fresh kernel (Restart & Run All) the fit below would raise NameError.
sentences = df['sentence'].values

# min_df=1 is the documented default and yields the same vocabulary as the
# previous min_df=0 (every vocabulary term occurs in at least one document),
# while avoiding the parameter-validation error newer scikit-learn releases
# raise for an integer min_df below 1.
# lowercase=False keeps 'This' and 'this' as separate vocabulary entries.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)

# Mapping of token -> column index in the bag-of-words matrix.
vectorizer.vocabulary_

{'This': 151,
 'is': 644,
 'one': 792,
 'of': 783,
 'the': 1066,
 'beginner': 251,
 'and': 196,
 'best': 256,
 'course': 349,
 'in': 609,
 'computer': 319,
 'vision': 1133,
 'would': 1170,
 'like': 682,
 'to': 1087,
 'include': 610,
 'advance': 179,
 'version': 1128,
 'this': 1081,
 'as': 219,
 'well': 1144,
 'It': 76,
 'an': 195,
 'excellent': 466,
 'for': 512,
 'Phd': 113,
 'students': 1012,
 'it': 647,
 'gives': 536,
 'broad': 271,
 'different': 401,
 'research': 918,
 'areas': 215,
 'Machine': 85,
 'field': 499,
 'Presenting': 117,
 'two': 1107,
 'papers': 821,
 'mid': 726,
 'exam': 462,
 'project': 876,
 'are': 213,
 'more': 736,
 'than': 1062,
 'sufficient': 1023,
 'give': 534,
 'exposure': 483,
 'really': 897,
 'learn': 669,
 'lot': 693,
 'from': 520,
 'which': 1151,
 'will': 1157,
 'definitely': 375,
 'help': 574,
 'me': 717,
 'my': 741,
 'future': 525,
 'should': 961,
 'be': 246,
 'offered': 785,
 'regular': 904,
 'semester': 948,
 'The': 148,
 'very': 1129,
 'good': 544,
 'We

In [6]:
# vectorizer.transform(sentences).toarray()

In [12]:
from sklearn.model_selection import train_test_split

# Only the 'student' dataset is enabled in `filepath_dict`; filtering on 'yelp'
# produced an empty frame, which makes train_test_split raise ValueError
# (it cannot split zero samples).
df_student = df[df['source'] == 'student']
# Legacy alias so any cell that still refers to the old name keeps working.
df_yelp = df_student

sentences = df_student['sentence'].values
y = df_student['label'].values

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000)

In [13]:
# from sklearn.feature_extraction.text import CountVectorizer

# vectorizer = CountVectorizer()
# vectorizer.fit(sentences_train)

# X_train = vectorizer.transform(sentences_train)
# X_test  = vectorizer.transform(sentences_test)
# X_train
# # <750x1714 sparse matrix of type '<class 'numpy.int64'>'
# #     with 7368 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)
# score = classifier.score(X_test, y_test)

# print("Accuracy:", score)

In [20]:
# For every loaded dataset: split it, build bag-of-words features, then fit and
# score three baseline classifiers on the same train/test split.
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # Learn the vocabulary from the training sentences only, so no information
    # from the test split leaks into the features.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    Naive = naive_bayes.MultinomialNB()
    # `degree` and `gamma` have no effect with a linear kernel, so they are
    # no longer passed.
    SVM = svm.SVC(C=1.0, kernel='linear')

    # Each printed line names its model; previously all three lines carried
    # the same label and could not be told apart.
    models = (
        ('Logistic Regression', classifier),
        ('Naive Bayes', Naive),
        ('SVM (linear)', SVM),
    )
    scores = {}
    for model_name, model in models:
        model.fit(X_train, y_train)
        scores[model_name] = model.score(X_test, y_test)
        print('{} accuracy for {} data: {:.4f}'.format(
            model_name, source, scores[model_name]))

    # Keep the individual score names available for later cells.
    score = scores['Logistic Regression']
    n_score = scores['Naive Bayes']
    s_score = scores['SVM (linear)']

Accuracy for student data: 0.8634
Accuracy for student data: 0.9006
Accuracy for student data: 0.8571


In [57]:
#using keras

from keras.models import Sequential
from keras import layers

# The network takes one input per bag-of-words column.
# NOTE: X_train / X_test / y_train / y_test are whatever an earlier cell left
# in the kernel (the last iteration of the per-source loop).
input_dim = X_train.shape[1]

# One hidden layer of 10 ReLU units feeding a single sigmoid output unit.
model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

# Train silently; the test split is passed as validation data so per-epoch
# validation metrics are recorded in `history`.
history = model.fit(
    X_train, y_train,
    batch_size=10,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=False,
)

# Report accuracy on the training split first, then on the test split;
# `loss` and `accuracy` end up holding the test-split values.
for report_template, features, targets in (
        ("Training Accuracy: {:.4f}", X_train, y_train),
        ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))


Using TensorFlow backend.


In [None]:

import matplotlib.pyplot as plt
# Select matplotlib's 'ggplot' style sheet. This is a global setting, so it
# applies to every figure drawn afterwards in this kernel.
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss, side by side.

    Parameters
    ----------
    history : object with a ``history`` dict attribute (for example the value
        returned by ``model.fit``). The dict must contain ``'loss'``,
        ``'val_loss'`` and an accuracy series stored either as
        ``'acc'``/``'val_acc'`` or as ``'accuracy'``/``'val_accuracy'``.

    Raises
    ------
    KeyError
        If a required series is missing from ``history.history``.
    """
    metrics = history.history

    # Older Keras releases log accuracy as 'acc'; newer ones use the metric
    # name given to compile() (here 'accuracy'). Accept both spellings.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']

    # Epochs are numbered from 1 on the x axis.
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    # (subplot position, training series, validation series, legend suffix, title suffix)
    panels = (
        (1, acc, val_acc, 'acc', 'accuracy'),
        (2, loss, val_loss, 'loss', 'loss'),
    )
    for position, train_values, val_values, legend_name, title_name in panels:
        plt.subplot(1, 2, position)
        plt.plot(x, train_values, 'b', label='Training ' + legend_name)
        plt.plot(x, val_values, 'r', label='Validation ' + legend_name)
        plt.title('Training and validation ' + title_name)
        plt.legend()