In [2]:
Sports_URLs = ['https://en.wikipedia.org/wiki/Football',
        'https://en.wikipedia.org/wiki/Cricket',
        'https://en.wikipedia.org/wiki/Badminton',
        'https://en.wikipedia.org/wiki/Basketball',
        'https://en.wikipedia.org/wiki/Hockey']

Education_URLs = ['https://en.wikipedia.org/wiki/School',
        'https://en.wikipedia.org/wiki/College',
        'https://en.wikipedia.org/wiki/University',
        'https://en.wikipedia.org/wiki/Professor',
        'https://en.wikipedia.org/wiki/Teacher']

In [3]:
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    text = text.lower()
    text = ''.join(e for e in text if e.isalnum() or e == ' ')
    tokens = text.split()
    stop_words = set(["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"])
    tokens = [word for word in tokens if word not in stop_words]
    text = " ".join(tokens)
    return text

def get_text(URL):
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    Ps = soup.find_all("p")
    text = ""
    for p in Ps:
        text += p.text.strip()
    return clean_text(text)

Sports_text = [get_text(URL) for URL in Sports_URLs]
Education_text = [get_text(URL) for URL in Education_URLs]

In [4]:
Sports_text

['football family team sports involve varying degrees kicking ball score goal unqualified word football normally means form football popular word used sports commonly called football include association football known soccer australia canada south africa united states sometimes ireland new zealand australian rules football gaelic football gridiron football specifically american football arena football canadian football international rules football rugby league football rugby union football1 various forms football share varying degrees common origins known football codesthere number references traditional ancient prehistoric ball games played many different parts world234 contemporary codes football traced back codification games english public schools 19th century outgrowth medieval football56 expansion cultural power british empire allowed rules football spread areas british influence outside directly controlled empire7 end 19th century distinct regional codes already developing gaeli

In [5]:
Education_text

['school educational institution building designed provide learning spaces learning environments teaching students direction teachers countries systems formal education sometimes compulsory2 systems students progress series schools built operated government private organization names schools vary country discussed regional terms section generally include primary school young children secondary school teenagers completed primary education institution higher education taught commonly called university college universityin addition core schools students given country may also attend schools primary elementary us secondary middle school us education3 kindergarten preschool provide schooling young children typically ages 35 university vocational school college seminary may available secondary school school may dedicated one particular field school economics dance alternative schools may provide nontraditional curriculum methodsnongovernment schools also known private schools4 may required g

In [6]:
import pandas as pd

df = pd.DataFrame({
    "text": Sports_text + Education_text,
    "category": ["Sports"]*len(Sports_text) + ["Education"]*len(Education_text)
})

df

Unnamed: 0,text,category
0,football family team sports involve varying de...,Sports
1,firstclass cricketone day internationallimited...,Sports
2,badminton racquet sport played using racquets ...,Sports
3,basketball team sport two teams commonly five ...,Sports
4,hockey term used denote family various types s...,Sports
5,school educational institution building design...,Education
6,college latin collegium educational institutio...,Education
7,university latin universitasa whole institutio...,Education
8,professor commonly abbreviated prof1 academic ...,Education
9,teacher also called schoolteacher formally edu...,Education


In [15]:
from collections import Counter
import numpy as np

def get_unigram_counts(texts):
    unigram_counts = Counter()
    for text in texts:
        unigram_counts.update(text.split())
    return unigram_counts

unigram_counts = get_unigram_counts(df["text"])
unigram_counts

def get_unigram_count_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

unigram_count_matrix = get_unigram_count_matrix(df["text"], unigram_counts)
unigram_count_matrix

unigram_count_df = pd.DataFrame(unigram_count_matrix, columns=unigram_counts.keys())
unigram_count_df

Unnamed: 0,football,family,team,sports,involve,varying,degrees,kicking,ball,score,...,ancestors,succeeding,never,really,secure,formative,minds,spirits,nurtured,truths
0,209.0,1.0,13.0,13.0,2.0,2.0,2.0,13.0,86.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.0,0.0,44.0,3.0,1.0,0.0,0.0,0.0,58.0,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,2.0,5.0,0.0,0.0,0.0,0.0,5.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,54.0,8.0,0.0,2.0,1.0,1.0,99.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,5.0,13.0,0.0,1.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,1.0,0.0,0.0,26.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
unigram_count_df.shape

(10, 8499)

In [17]:
def get_bigram_counts(texts):
    bigram_counts = Counter()
    for text in texts:
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts.update(bigrams)
    return bigram_counts

bigram_counts = get_bigram_counts(df["text"])
bigram_counts

def get_bigram_prob_matrix(texts, bigram_counts, unigram_counts):
    matrix = np.zeros((len(texts), len(bigram_counts)))
    for i, text in enumerate(texts):
        words = text.split()
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        for j, bigram in enumerate(bigram_counts):
            matrix[i, j] = bigram_counts[bigram]/unigram_counts[bigram[0]]
    return matrix

bigram_prob_matrix = get_bigram_prob_matrix(df["text"], bigram_counts, unigram_counts)
bigram_prob_matrix

bigram_prob_df = pd.DataFrame(bigram_prob_matrix, columns=bigram_counts.keys())
bigram_prob_df

Unnamed: 0_level_0,football,family,team,sports,involve,varying,degrees,kicking,ball,score,...,formative,years,minds,spirits,nurtured,school,environment,embodies,basic,american
Unnamed: 0_level_1,family,team,sports,involve,varying,degrees,kicking,ball,score,goal,...,years,minds,spirits,nurtured,school,environment,embodies,basic,american,truths
0,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
1,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
2,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
3,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
4,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
5,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
6,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
7,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
8,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519
9,0.004484,0.333333,0.042373,0.023256,0.2,0.6,0.022222,0.214286,0.003802,0.033333,...,1.0,0.019231,1.0,1.0,1.0,0.004405,0.125,0.5,0.0625,0.018519


In [18]:
bigram_prob_df.shape

(10, 29286)

In [19]:
def get_tf_matrix(texts, unigram_counts):
    matrix = np.zeros((len(texts), len(unigram_counts)))
    for i, text in enumerate(texts):
        counts = Counter(text.split())
        for j, word in enumerate(unigram_counts):
            matrix[i, j] = counts[word]
    return matrix

tf_matrix = get_tf_matrix(df["text"], unigram_counts)
tf_matrix

def get_idf_vector(texts, unigram_counts):
    idf_vector = np.zeros(len(unigram_counts))
    for j, word in enumerate(unigram_counts):
        idf_vector[j] = np.log(len(texts)/sum([1 for text in texts if word in text]))
    return idf_vector

idf_vector = get_idf_vector(df["text"], unigram_counts)
idf_vector

def get_tfidf_matrix(tf_matrix, idf_vector):
    return tf_matrix*idf_vector

tfidf_matrix = get_tfidf_matrix(tf_matrix, idf_vector)
tfidf_matrix

tfidf_df = pd.DataFrame(tfidf_matrix, columns=unigram_counts.keys())
tfidf_df

Unnamed: 0,football,family,team,sports,involve,varying,degrees,kicking,ball,score,...,ancestors,succeeding,never,really,secure,formative,minds,spirits,nurtured,truths
0,144.867761,1.203973,6.640733,4.636774,0.446287,2.407946,0.71335,20.922693,43.931004,1.070025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.158883,0.0,22.476327,1.070025,0.223144,0.0,0.0,0.0,29.627886,4.636774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.693147,0.0,1.021651,1.783375,0.0,0.0,0.0,0.0,2.554128,2.14005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.465736,0.0,27.584584,2.8534,0.0,2.407946,0.356675,1.609438,50.571737,2.496725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.386294,1.203973,2.554128,4.636774,0.0,1.203973,0.0,0.0,7.662384,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.71335,0.0,0.0,0.356675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.356675,0.0,0.0,9.273549,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,3.210074,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.71335,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.203973,0.0,0.0,0.446287,0.0,1.070025,0.0,0.0,0.0,...,2.302585,2.302585,0.510826,2.302585,2.302585,2.302585,1.609438,2.302585,2.302585,2.302585


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(unigram_count_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

In [21]:
X_train, X_test, y_train, y_test = train_test_split(bigram_prob_matrix, df["category"], test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.5

In [22]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df["category"], test_size=0.2, random_state=42)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

1.0