In [1]:
import requests, time
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
import functools
import operator
import string
from sklearn.model_selection import train_test_split
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

from math import sqrt

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

# Entry page of the site; it is handed to get_categories as the first pipeline step.
url = 'https://clientsfromhell.net/'

def pipe(obj, *fns):
    """Thread ``obj`` through ``fns`` from left to right.

    ``pipe(x, f, g)`` evaluates ``g(f(x))``. With no functions given, ``obj``
    is returned unchanged.
    """
    result = obj
    for fn in fns:
        result = fn(result)
    return result

def get_categories(url, timeout=30):
    """Collect the ``(name, href)`` pairs of the known case categories.

    Downloads ``url``, inspects every ``<li class="flex items-center">``
    element and keeps those whose text (with newlines removed) is one of the
    hard-coded category names.

    Parameters
    ----------
    url : str
        Page to download.
    timeout : float, optional
        Seconds passed to ``requests.get`` so that an unresponsive server
        cannot block the call indefinitely.

    Returns
    -------
    list of (str, str)
        Unique ``(category name, href)`` tuples, sorted so that repeated runs
        yield the categories in the same order.
    """
    categories = (
        'Dunces', 'Criminals', 'Deadbeats', 'Racists', 'Homophobes',
        'Sexist', 'Frenemies', 'Cryptic', 'Ingrates', 'Chaotic Good',
    )

    html = requests.get(url, timeout=timeout)
    response = bs(html.content, features="html.parser")

    category_pairs = set()
    for item in response.find_all('li', {'class': 'flex items-center'}):
        anchor = item.find('a')
        if anchor is None:
            # A list item without a link cannot point to a category page.
            continue
        href = anchor.get('href')
        if href is None:
            continue
        item_name = re.sub('\\n', '', item.text)
        if item_name in categories:
            category_pairs.add((item_name, href))

    # A set removes duplicates; sorting makes the order independent of hashing.
    return sorted(category_pairs)

def url_categroy_creator(list_categories):
    """Build the paginated URL prefix of every category.

    Each ``(name, href)`` pair becomes ``(prefix, name)``, where ``prefix`` is
    the site root followed by ``href`` and ``'page/'``; a page number is meant
    to be appended to it later.
    """
    return [
        ('https://clientsfromhell.net' + pair[1] + 'page/', pair[0])
        for pair in list_categories
    ]
        
def page_num_creator(url_category_list : list, timeout=30):
    """Look up how many listing pages every category has.

    For each ``(url_prefix, category)`` entry the first page
    (``url_prefix + '1'``) is downloaded and its ``<a class="page-numbers">``
    links are inspected.

    Parameters
    ----------
    url_category_list : list
        ``(url_prefix, category)`` pairs.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the loop.

    Returns
    -------
    list of (str, str, str)
        ``(url_prefix, last_page, category)`` tuples; ``last_page`` is the
        text of the link, i.e. still a string.
    """
    list_url_num = []
    for entry in url_category_list:
        url_prefix, category = entry[0], entry[1]

        html = requests.get(url_prefix + '1', timeout=timeout)
        response = bs(html.content, "html.parser")
        page_links = response.find_all('a', {'class': 'page-numbers'})

        if len(page_links) >= 2:
            # The second-to-last link is read as the highest page number
            # (presumably the last one is a "next" link - confirm on the site).
            max_pag = page_links[-2].text
        else:
            # Without at least two pagination links there is nothing to index:
            # treat the category as having a single page instead of failing.
            max_pag = '1'

        list_url_num.append((url_prefix, max_pag, category))
    return list_url_num

class IronhackSpider:
    """Download consecutively numbered pages and parse each of them.

    Parameters
    ----------
    url_pattern : str
        ``%``-style pattern; the page number is interpolated into it.
    pages_to_scrape : int, optional
        Pages ``1 .. pages_to_scrape`` are requested.
    sleep_interval : float, optional
        Seconds to wait between two consecutive requests. Values that are
        ``None`` or not greater than zero (the default is ``-1``) disable the
        pause.
    content_parser : callable, optional
        Receives the raw response body and returns the parsed page. When it is
        ``None`` the raw body is returned as is.
    """

    # Seconds before a request is abandoned, so a stalled server cannot hang
    # the whole run.
    request_timeout = 30

    def __init__(self, url_pattern, pages_to_scrape=1, sleep_interval=-1, content_parser=None):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser

    def scrape_url(self, url):
        """Fetch ``url`` and return its parsed content."""
        response = requests.get(url, timeout=self.request_timeout)
        if self.content_parser is None:
            return response.content
        return self.content_parser(response.content)

    def kickstart(self):
        """Scrape every page in order and return the list of results."""
        pause = self.sleep_interval
        list_pages = []
        for i in range(1, self.pages_to_scrape + 1):
            # Pause between requests only, never before the first one.
            if i > 1 and pause is not None and pause > 0:
                time.sleep(pause)
            list_pages.append(self.scrape_url(self.url_pattern % i))
        return list_pages

def content_parser(content):
    """Identity parser: return the given content untouched."""
    return content

def case_parser(content):
    """Extract the paragraph texts of the posts contained in ``content``.

    ``content`` is parsed with the ``html.parser`` backend; the result is the
    list of ``.text`` values of every element matched by the selector
    ``div [class="w-blog-post-content"] > p``, in the order ``select``
    returns them.
    """
    soup = bs(content, "html.parser")
    paragraphs = soup.select('div [class="w-blog-post-content"] > p')
    return [paragraph.text for paragraph in paragraphs]

def initialize_scraping(url_pagenum_cat_list : list):
    """Scrape every category and return ``{category: scraped pages}``.

    Each ``(url_prefix, page_count, category)`` entry gets its own
    ``IronhackSpider`` whose URL pattern is ``url_prefix + '%s/'`` and whose
    parser is ``case_parser``; ``page_count`` is converted with ``int``.
    The value stored per category is whatever ``kickstart`` returns.
    """
    scraped = {}
    for prefix, page_count, category in url_pagenum_cat_list:
        spider = IronhackSpider(prefix + '%s/', int(page_count), content_parser=case_parser)
        scraped[category] = spider.kickstart()
    return scraped

def stem(sentence : list):
    """Stem every token of ``sentence`` with a ``PorterStemmer``.

    Parameters
    ----------
    sentence : list of str
        Already tokenised words. Passing a plain string would stem it
        character by character, because the argument is simply iterated.

    Returns
    -------
    list of str
        The stemmed tokens, in the same order.
    """
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in sentence]

def _clean_case(paragraphs, stop_words, strip_punctuation):
    """Turn the paragraphs of one scraped page into a list of stemmed tokens."""
    if not isinstance(paragraphs, (list, tuple)):
        # Padding cells (e.g. the '' written by fillna) hold no paragraphs.
        return []

    raw_words = []
    for paragraph in paragraphs:
        # Only the lines spoken by the client are kept.
        if not paragraph.startswith('Client:'):
            continue
        # re.sub is required here: str.replace would look for the literal
        # text '\xa0|\n|Client: ' and never match anything.
        text = re.sub('\xa0|\n|Client: ', ' ', paragraph)
        text = text.translate(strip_punctuation)
        # An em dash usually joins two words, so it becomes a separator.
        text = text.replace('—', ' ').rstrip()
        raw_words.extend(text.split(' '))

    tokens = []
    for word in raw_words:
        if len(word) == 1:
            continue
        word = word.lower()
        word = re.sub(r'^(.)\1+', r'\1', word)  # collapse a repeated leading character
        word = word.replace("’", "'").replace("client", "").rstrip("'")
        if word in stop_words or not word.isalpha() or len(word) == 1:
            continue
        tokens.append(word)

    return stem(tokens)


def cleaning(df : pd.DataFrame):
    """Convert the scraped page/category table into one cleaned text per case.

    Every cell of ``df`` is expected to hold the list of paragraph strings of
    one scraped page (one column per category, one row per page); cells that
    are not lists are treated as empty. For every cell the paragraphs that
    start with ``'Client:'`` are kept, stripped of punctuation, lower-cased,
    filtered (English stop words, non-alphabetic tokens, one-letter tokens,
    the substring ``'client'``) and stemmed.

    Parameters
    ----------
    df : pd.DataFrame
        Table of paragraph lists. It is not modified.

    Returns
    -------
    pd.DataFrame
        Columns ``'category'`` (the column label the case came from) and
        ``'case'`` (the remaining tokens joined by single spaces). Rows are
        ordered page by page, cases without any remaining token are left
        out, and the index runs from 0.
    """
    # Loaded once: re-reading the stop word list for every word is very slow.
    stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    records = []
    for row_pos in range(df.shape[0]):
        for col_pos, category in enumerate(df.columns):
            tokens = _clean_case(df.iat[row_pos, col_pos], stop_words, strip_punctuation)
            if tokens:
                records.append({'category': category, 'case': ' '.join(tokens)})

    # Built in one go; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['category', 'case'])

# Run the scraping pipeline: categories -> URL prefixes -> page counts -> scraped paragraphs.
# NOTE: this downloads every listing page of every category, so it is slow and depends on
# the network; the output recorded below shows such a run ending in a ConnectionError.
var = pipe(url, get_categories, url_categroy_creator, page_num_creator, initialize_scraping)
# df_clients_og = pd.DataFrame.from_dict(var, orient = 'index').fillna('').transpose()
# df_test = cleaning(df_clients_og)



ConnectionError: HTTPSConnectionPool(host='clientsfromhell.net', port=443): Max retries exceeded with url: /tag/ignoramus/page/78/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x11de3bdd0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

In [None]:
def df_creator(dic_):
    """Build a page-by-category table from a ``{category: pages}`` mapping.

    Parameters
    ----------
    dic_ : dict
        Maps each category to the sequence of its scraped pages.

    Returns
    -------
    pd.DataFrame
        One column per key of ``dic_`` and one row per page position;
        positions missing for the shorter categories are filled with ``''``.
    """
    # Use the argument: reading the notebook-level ``var`` here made the result
    # depend on hidden state instead of on what the caller passed in.
    return pd.DataFrame.from_dict(dic_, orient='index').fillna('').transpose()

def catetory_replacer(df, col = 'category'):
    """Encode the labels of ``df[col]`` as integers.

    Labels are numbered from 0 in order of first appearance.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the label column. The column is overwritten in ``df``
        itself, and that same object is returned.
    col : str, optional
        Name of the label column.

    Returns
    -------
    (pd.DataFrame, dict)
        The frame with the encoded column and the ``{label: code}`` mapping.
    """
    dic_cat = {cat: code for code, cat in enumerate(df[col].unique())}

    # Assign the column explicitly: an in-place replace on ``df[col]`` acts on
    # a temporary object and leaves ``df`` untouched under copy-on-write.
    df[col] = df[col].map(dic_cat)

    return df, dic_cat

In [None]:
def t_t_split(df, target_col = 'category'):
    """Split ``df`` into train and test parts (80 % / 20 %, ``random_state=42``).

    Every column except ``target_col`` is a feature; the target is kept as a
    one-column frame. Returns ``(X_train, X_test, y_train, y_test)``.
    """
    feature_cols = [name for name in df.columns if name != target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols],      # Features (X)
        df[[target_col]],      # Target (y)
        test_size = .2,
        random_state = 42
    )
    return X_train, X_test, y_train, y_test

In [None]:
def list_split(string_):
    """Split ``string_`` on runs of whitespace and return the pieces as a list."""
    return string_.split()

def convert_to_word_col(df, case_col = 'case', target_col = 'category'):
    """Expand the text column into one word-count column per distinct word.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text and the target column. It is not modified.
    case_col : str, optional
        Column with whitespace-separated words.
    target_col : str, optional
        Column carried over unchanged as the first column of the result.

    Returns
    -------
    pd.DataFrame
        ``target_col`` followed by one column per word (in order of first
        appearance) holding how often the word occurs in each row; words
        absent from a row count as 0.
    """
    word_counts = [dict(Counter(text.split())) for text in df[case_col]]

    # One constructor call replaces the row-by-row append (removed in pandas 2.x,
    # and quadratic); reusing df.index keeps counts aligned with their rows.
    df_count = pd.DataFrame(word_counts, index=df.index)

    df_ = df[[target_col]].merge(df_count, left_index=True, right_index=True)
    return df_.fillna(0)


In [None]:
def all_num_models_fitting(X_train, y_train):
    """Fit the four classifiers that are trained on the numeric features.

    ``y_train`` is flattened with ``.values.ravel()`` before fitting.
    Returns the fitted estimators as
    ``(LogisticRegression, KNeighborsClassifier, MultinomialNB, RandomForestClassifier)``.
    """
    labels = y_train.values.ravel()

    log_regr = LogisticRegression(solver = 'lbfgs')
    knn = KNeighborsClassifier(n_neighbors = 3)  # overrides the default k = 5
    multi = MultinomialNB()
    rfc = RandomForestClassifier(max_depth=10, random_state=42)

    for estimator in (log_regr, knn, multi, rfc):
        estimator.fit(X_train, labels)

    return log_regr, knn, multi, rfc
    
    
    

In [None]:
def all_bool_models_fitting(X_train, y_train, X_test=None):
    """Fit the Bernoulli and Gaussian naive Bayes classifiers.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. ``BernoulliNB`` is fitted on their boolean cast,
        ``GaussianNB`` on the values as given.
    y_train : pd.DataFrame
        Training target; flattened with ``.values.ravel()``.
    X_test : pd.DataFrame, optional
        Test features to cast to boolean as well. The function used to read a
        name ``X_test`` it never received, which fails unless such a global
        happens to exist; it is now an explicit, optional argument.

    Returns
    -------
    tuple
        ``(X_test_bool, bernoulli, gaussian)``. ``X_test_bool`` is ``None``
        when no ``X_test`` was passed.
    """
    labels = y_train.values.ravel()

    X_train_bool = X_train.astype(bool)
    X_test_bool = None if X_test is None else X_test.astype(bool)

    bernoulli = BernoulliNB().fit(X_train_bool, labels)

    guassian = GaussianNB().fit(X_train, labels)

    return X_test_bool, bernoulli, guassian

In [None]:
def model_score(model, X_test, y_test):
    """Print and return ``model.score(X_test, y_test)`` as a percentage.

    The score line is followed by one of two remarks, chosen by whether the
    percentage reaches 50, and by an empty line. If neither comparison holds
    (e.g. the score is NaN) nothing is printed.
    """
    score = model.score(X_test, y_test)*100

    if score >= 50.0:
        remark = "DON'T GET COCKY NOW!!! KEEP MAKING IT BETTER!"
    elif score < 50.0:
        remark = "Your algorithm stinks so much, I could toss a coin and make better predictions =P..."
    else:
        return score

    print('Score: ',score,'%')
    print(remark)
    print('')

    return score

In [None]:
def predict(model, X_test):
    """Return what ``model.predict`` yields for ``X_test``."""
    return model.predict(X_test)
    

In [None]:
def model_metrics(y_test, prediction):
    """Compute nine evaluation metrics for ``prediction`` against ``y_test``.

    Returns the tuple
    ``(r2, mse, rmse, mae, accuracy, balanced accuracy, precision, recall, f1)``.
    Precision, recall and F1 are all called with ``pos_label = 2`` and
    ``average = 'weighted'``.
    """
    # Keyword arguments shared by the three averaged metrics.
    averaging = {'pos_label': 2, 'average': 'weighted'}

    mse = mean_squared_error(y_test, prediction)

    return (
        r2_score(y_test, prediction),
        mse,
        sqrt(mse),
        mean_absolute_error(y_test, prediction),
        accuracy_score(y_test, prediction),
        balanced_accuracy_score(y_test, prediction),
        precision_score(y_test, prediction, **averaging),
        recall_score(y_test, prediction, **averaging),
        f1_score(y_test, prediction, **averaging),
    )

In [None]:
def run_all_models_and_score(df):
    """Fit every classifier on ``df`` and print its score and metrics.

    ``df`` is split with ``t_t_split``; the four numeric models and the two
    naive Bayes models are fitted on the training part. For each model the
    name, the ``model_score`` report and the nine ``model_metrics`` values
    are printed. Nothing is returned.

    Every model is scored and measured on the same test matrix, matching what
    it was fitted on: the Bernoulli model on the boolean cast of the test
    features, all other models (including the Gaussian one, which is fitted
    on the uncast features) on the test features as they are.
    """
    X_train, X_test, y_train, y_test = t_t_split(df)

    models = all_num_models_fitting(X_train, y_train) #log_regr, knn, multi, rfc

    # The boolean test matrix is derived from this function's own split below,
    # so the first element returned by the helper is not needed.
    _, ber, gua = all_bool_models_fitting(X_train, y_train)
    X_test_bool = X_test.astype(bool)

    models = models + (ber, gua)
    test_sets = [X_test] * 4 + [X_test_bool, X_test]

    metrics_names = ['R2: ', 'MSE: ', 'RMSE: ', 'MAE: ',
                     'Accuracy: ', 'Balanced Acc: ', 'Precision: ',
                     'Recall: ', 'F1 Score: ']

    model_names = ['Log Regression', 'KNN', 'Multinomial', 'Random Forest', 'Bernoulli', 'Gaussian']

    for name, model, X_eval in zip(model_names, models, test_sets):
        print(name)
        print('')
        model_score(model, X_eval, y_test) #returns score and prints it

        # Predict and evaluate once per model instead of once per printed line.
        metrics = model_metrics(y_test, predict(model, X_eval)) #r2, mse, rmse, mae, acc, bacc, prec, rec, f1
        for metric_name, value in zip(metrics_names, metrics):
            print(metric_name, value)
        print('')

In [None]:
# End-to-end run. Inner pipe: build the table from `var`, clean it, encode the categories;
# `[0]` keeps the frame and drops the label mapping. Outer pipe: expand the cases into
# word-count columns, then fit and report on every model.
pipe(pipe(var, df_creator, cleaning, catetory_replacer)[0], convert_to_word_col, run_all_models_and_score)