In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_profiling as pp

import re
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import digits

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv
/kaggle/input/moviesreal/movies.csv
/kaggle/input/moviesreal/users.csv
/kaggle/input/moviesreal/train.csv
/kaggle/input/moviesreal/test.csv


# Part 1. EDA and Model Creation and Comparison with Supervised Learning

In this part I will do some simple EDA and feature creation to prepare the data for the models.

I will then create two multiple models to try and get an accurate outcome.

# 1.1 EDA

In [2]:
train = pd.read_csv('../input/learn-ai-bbc/BBC News Train.csv')

Starting the EDA with an investigation of the untokenized data.
The following observations are important:
1. There is no missing data
2. 96.6% of the articles are unique
3. The article category prevelance is as follows - Sports {346}, Business {336}, Politics {274}, Entertainment {273}, Tech {261}

In [3]:
profile = pp.ProfileReport(train, title="Pandas Profiling Report", explorative=True)
profile.to_notebook_iframe()
profile.to_file("first_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
stop_words = stopwords.words('english')
train['Text'] = train['Text'].apply(lambda row: re.sub(r'[^\w\s]+', '', row))
train['Text'] = train['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
train['Text'] = train['Text'].apply(lambda x: re.sub(' +', ' ', x))

X_train, X_valid, y_train, y_valid = train_test_split(train['Text'], train['Category'], test_size=0.33, random_state=42)

Next we run Pandas Profiling again on the dataset to discover any new insights from tokenizing the document.

We find that the average document has 220 words. The maximum amount of words is 1698 and the minimum is 48.

In [5]:
dataframe = pd.DataFrame()
dataframe['tokenized'] = train.apply(lambda row: nltk.word_tokenize(row['Text']), axis=1)
dataframe['num_words'] = dataframe['tokenized'].apply(lambda lst: len(lst))

profile = pp.ProfileReport(dataframe, title="Pandas Profiling Report", explorative=True)
profile.to_notebook_iframe()
profile.to_file("second_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Next we create a vector of words and the number of times they appear in a text.

We transform this into an array and return a training and validation set set for supervised learning.

In [6]:
def count_vectorization(n_features):
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words="english", lowercase = True)
    tf = tf_vectorizer.fit_transform(train['Text'])
    count_tokens = tf_vectorizer.get_feature_names_out()
    train_countvect = pd.DataFrame(data = tf.toarray(),columns = count_tokens)
    return train_test_split(train_countvect, train['Category'], test_size=0.33, random_state=42)

Next we calculate the tfidf weights for the corpus of documents

We transform this into an array and return a training and validation set for supervised learning.

In [7]:
def tfidf_vectorization(n_features):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features,stop_words="english", lowercase = True)
    tf = tfidf_vectorizer.fit_transform(train['Text'])
    tokens = tfidf_vectorizer.get_feature_names_out()
    train_vect = pd.DataFrame(data = tf.toarray(),columns = tokens)
    return train_test_split(train_vect, train['Category'], test_size=0.33, random_state=42)

The below code runs ETL on the vector of words. However, because of the number of words it is unweildy and can only be run for a few words at a time. 

The inspiration was to test correlation to the category for each word and to see which words had high correlation to each other.

In [8]:
#n_features = 5
#X_train_s, X_valid_s, y_train_s, y_valid_s = count_vectorization(n_features)
#supervisedLearningData = pd.concat([X_train_s, y_train_s], axis="columns")
#profile = pp.ProfileReport(df_countvect, title="Pandas Profiling Report", explorative=True)
#profile.to_file("third_profile.html")

Next we calculate the tfidf weights for the corpus of documents

This is the dataset we use for the nmf unsupervised learning model

In [9]:
def tfidf(X_train, X_valid):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english", lowercase = True)
    tfidf_vectorizer.fit(X_train)
    tfidf_train = tfidf_vectorizer.transform(X_train)
    tfidf_valid = tfidf_vectorizer.transform(X_valid)
    tokens = tfidf_vectorizer.get_feature_names_out()
    train_vect = pd.DataFrame(data = tfidf_train.toarray(),columns = tokens)
    valid_vect = pd.DataFrame(data = tfidf_valid.toarray(),columns = tokens)
    return tfidf_train, tfidf_valid, train_vect, valid_vect


# 1.2 MODEL

Here we initiate the model and its params

In [10]:
def model(mtype, stype):
    nmf = NMF(
        init = 'nndsvdar',
        n_components=5,
        solver = stype,
        random_state=1,
        beta_loss=mtype,
        alpha_W=0.00005,
        alpha_H=0.00005,
        l1_ratio=.1,
    )
    return nmf

Here is a line of code to predict the outcome.

In [11]:
def predict(matrix):
    sortedMatrix = np.argsort(matrix)
    n_predictions, maxValue = sortedMatrix.shape
    
    predictions = [[sortedMatrix[i][maxValue - 1]] for i in range(n_predictions)]
    topics = np.empty(n_predictions, dtype = np.int64)
    
    for i in range(n_predictions):
        topics[i] = predictions[i][0]
    return topics

This is a function to identify the accuracy based on the best permutation of categories. 

In [12]:
def label_permute(ytdf,yp,perm_list, n=5):
    """
    ytdf: labels dataframe object
    yp: clustering label prediction output
    Returns permuted label order and accuracy. 
    Example output: (3, 4, 1, 2, 0), 0.74 
    """
    # your code here
    unique = np.unique(ytdf)
    perm = itertools.permutations(perm_list)
    accuracyMatrix = []
    for i in list(perm):
        j = 0
        ytdf_guess = ytdf
        for k in unique:
            ytdf_guess = np.where(ytdf_guess == k, i[j], ytdf_guess)
            j+=1
        pred = ytdf_guess.tolist()

        accuracy = accuracy_score(pred,yp)
        accuracyMatrix.append((i,accuracy))
    i = 0
    maxAccuracy = 0
    while i < len(accuracyMatrix):
        if accuracyMatrix[i][1] > maxAccuracy:
            maxtuple = accuracyMatrix[i]
            maxAccuracy = accuracyMatrix[i][1]
        i += 1
    
    print(maxtuple)
    return maxtuple

Belo we calculate the tfidf vector and array

In [13]:
tfidf_train, tfidf_valid, train_vect, valid_vect = tfidf(X_train, X_valid)

There are 4 models run in total. 2 for each loss function and one that predicts the validation set while the other predicts the training data. 

In [14]:
labels = [0,1,2,3,4]
m = model("frobenius", "cd").fit(tfidf_train)
print(f'trying with {m}')
yhat = predict(m.transform(tfidf_valid))
label_order, accuracy = label_permute(y_valid, yhat,labels)

print(label_order)
print(accuracy)

m = model("kullback-leibler", "mu").fit(tfidf_train)
print(f'trying with {m}')

yhat_valid = predict(m.transform(tfidf_valid))
label_order, accuracy = label_permute(y_valid, yhat, labels)

print(label_order)
print(accuracy)

m = model("frobenius", "mu").fit(tfidf_train)
print(f'trying with {m}')
yhat = predict(m.transform(tfidf_train))
label_order, accuracy = label_permute(y_train, yhat,labels)

print(label_order)
print(accuracy)

m = model("kullback-leibler", "mu").fit(tfidf_train)
print(f'trying with {m}')
yhat = predict(m.transform(tfidf_train))
label_order, accuracy = label_permute(y_train, yhat,labels)
print(label_order)
print(accuracy)

trying with NMF(alpha_H=5e-05, alpha_W=5e-05, init='nndsvdar', l1_ratio=0.1, n_components=5,
    random_state=1)
((4, 3, 1, 0, 2), 0.9105691056910569)
(4, 3, 1, 0, 2)
0.9105691056910569
trying with NMF(alpha_H=5e-05, alpha_W=5e-05, beta_loss='kullback-leibler', init='nndsvdar',
    l1_ratio=0.1, n_components=5, random_state=1, solver='mu')
((4, 3, 1, 0, 2), 0.9105691056910569)
(4, 3, 1, 0, 2)
0.9105691056910569
trying with NMF(alpha_H=5e-05, alpha_W=5e-05, init='nndsvdar', l1_ratio=0.1, n_components=5,
    random_state=1, solver='mu')
((4, 3, 1, 0, 2), 0.8817635270541082)
(4, 3, 1, 0, 2)
0.8817635270541082
trying with NMF(alpha_H=5e-05, alpha_W=5e-05, beta_loss='kullback-leibler', init='nndsvdar',
    l1_ratio=0.1, n_components=5, random_state=1, solver='mu')
((4, 3, 1, 0, 2), 0.9478957915831663)
(4, 3, 1, 0, 2)
0.9478957915831663


# 1.3 COMPARISON

Next I predicted using a kmeans clustering algorihtm as I had the code already written from a previous project.

In [15]:
def kclass():
    kmodel = KMeans(n_clusters=5)
    kmodel.fit(tfidf_train)
    return kmodel

In [16]:
kmodel = kclass()
yhat = kmodel.predict(tfidf_valid)
labelorder, acc = label_permute(y_valid, yhat,labels)
#confusion(label, kmeans.labels_,labelorder)

((2, 3, 4, 1, 0), 0.8495934959349594)


Below I run supervised training algorithm, a gradient boosting classifier. I use both types of array. The count as well as teh tfidf to see which one is more accurate dataset. 

In [17]:
n_features = 500
X_train_s_c, X_valid_s_c, y_train_s_c, y_valid_s_c = count_vectorization(n_features)
X_train_s, X_valid_s, y_train_s, y_valid_s = tfidf_vectorization(n_features)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0)
clf.fit(X_train_s, y_train_s)
print(clf.score(X_valid_s, y_valid_s))
clf = GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_depth=5, random_state=0)
clf.fit(X_train_s_c, y_train_s_c)
print(clf.score(X_valid_s, y_valid_s))

0.9329268292682927
0.34552845528455284


# Part 2. Limitation(s) of sklearn’s non-negative matrix factorization library.

In this part I will use nnm to predict movie ratings and discuss the limitations of nnm.

In [18]:
movies = pd.read_csv('../input/moviesreal/movies.csv')
train = pd.read_csv('../input/moviesreal/train.csv')
test = pd.read_csv('../input/moviesreal/test.csv')
users = pd.read_csv('../input/moviesreal/users.csv')


# 2.1 MODEL and RMSE

Below I create a training and validation set for the movie data and run the nmf models to predict the rating.

I use RMSE to evaluate each prediction. The inuition behind RMSE is how far off from the real solution I am.

For example, the RMSE ranges from 2 to 3 across the models. This means I that I could be up to 3 rating levels away from the true rating.

In [19]:
data = pd.concat([train, movies], axis="columns", join = 'inner')
y = data.pop('rating')
data.drop('title', axis =1, inplace = True)

X_train_movie, X_valid_movie, y_train_movie, y_valid_movie = train_test_split(data, y, test_size=0.33, random_state=42)

components = [1,2,3,4,5]
print(y_train_movie.unique())

m = model("frobenius", "cd").fit(X_train_movie)
print(f'trying with {m}')
yhat = predict(m.transform(X_valid_movie))
label_order, accuracy = label_permute(y_valid_movie, yhat,components)
rmse = mean_squared_error(y_valid_movie, yhat, squared=False)

print(f'RMSE {rmse}')
      
m = model("kullback-leibler", "mu").fit(X_train_movie)
print(f'trying with {m}')
yhat_valid = predict(m.transform(X_valid_movie))
label_order, accuracy = label_permute(y_valid_movie, yhat,components)
rmse = mean_squared_error(y_valid_movie, yhat, squared=False)

print(f'RMSE {rmse}')

m = model("frobenius", "mu").fit(X_train_movie)
print(f'trying with {m}')
yhat = predict(m.transform(X_train_movie))
label_order, accuracy = label_permute(y_train_movie, yhat,components)
rmse = mean_squared_error(y_train_movie, yhat, squared=False)

print(f'RMSE {rmse}')

m = model("kullback-leibler", "mu").fit(X_train_movie)
print(f'trying with {m}')
yhat = predict(m.transform(X_train_movie))
label_order, accuracy = label_permute(y_train_movie, yhat,components)
rmse = mean_squared_error(y_train_movie, yhat, squared=False)

print(f'RMSE {rmse}')

[5 3 4 2 1]
trying with NMF(alpha_H=5e-05, alpha_W=5e-05, init='nndsvdar', l1_ratio=0.1, n_components=5,
    random_state=1)
((2, 3, 1, 5, 4), 0.27613104524180965)
RMSE 2.5731118973323817




trying with NMF(alpha_H=5e-05, alpha_W=5e-05, beta_loss='kullback-leibler', init='nndsvdar',
    l1_ratio=0.1, n_components=5, random_state=1, solver='mu')
((2, 3, 1, 5, 4), 0.27613104524180965)
RMSE 2.5731118973323817




trying with NMF(alpha_H=5e-05, alpha_W=5e-05, init='nndsvdar', l1_ratio=0.1, n_components=5,
    random_state=1, solver='mu')
((1, 3, 4, 5, 2), 0.2964244521337947)
RMSE 2.587641048835078
trying with NMF(alpha_H=5e-05, alpha_W=5e-05, beta_loss='kullback-leibler', init='nndsvdar',
    l1_ratio=0.1, n_components=5, random_state=1, solver='mu')
((1, 3, 2, 5, 4), 0.33794694348327564)
RMSE 1.9153561037648177




# 2.2 Explanation of why NMF didnt succeed. 

NMF does best is sparce matrices. The movie ratings did not have enough information in the feature to come up with a good solution.

Below I test supervised learning on the dataset to see a better outcome. 

In [20]:
clf = GradientBoostingClassifier(n_estimators=5000, learning_rate=0.01, max_depth=5, random_state=0)
clf.fit(X_train_movie, y_train_movie)
print(clf.score(X_valid_movie, y_valid_movie))

0.3081123244929797
