In [1]:
# importing the needed libraries
import pandas as pd
import os
from time import perf_counter, time
import joblib
import random
import string
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, NuSVC
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer
from nltk.corpus import stopwords 

!pip install Arabic-Stopwords
import arabicstopwords.arabicstopwords as stp




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
seed = 7

# fix random number generation aka regenerate the same random numbers every time (such as weight and bias initialization )
def set_random_seed(seed):
    """Set random seed, for python, numpy

    Args:
        seed (int): Seed to be used.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
set_random_seed(seed=seed)

In [4]:
# Reading the files
df1 = pd.read_csv('./resourses/archive/stories_art-et-culture.csv', index_col='id')
df2 = pd.read_csv('./resourses/archive/stories_economie.csv', index_col='id')
df3 = pd.read_csv('./resourses/archive/stories_faits-divers.csv', index_col='id')
df4 = pd.read_csv('./resourses/archive/stories_marocains-du-monde.csv', index_col='id')
df5 = pd.read_csv('./resourses/archive/stories_medias.csv', index_col='id')
df6 = pd.read_csv('./resourses/archive/stories_orbites.csv', index_col='id')
df7 = pd.read_csv('./resourses/archive/stories_politique.csv', index_col='id')
df8 = pd.read_csv('./resourses/archive/stories_regions.csv', index_col='id')
df9 = pd.read_csv('./resourses/archive/stories_societe.csv', index_col='id')
df10 = pd.read_csv('./resourses/archive/stories_sport.csv', index_col='id')
df11 = pd.read_csv('./resourses/archive/stories_tamazight.csv', index_col='id')

In [5]:
df1.head()

Unnamed: 0_level_0,Unnamed: 0,title,date,author,story,topic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f06aa998054e11eba66e646e69d991ea,0,"""بيت الشعر"" يسائل وزير الثقافة عن كوابيس سوداء",الجمعة 02 أكتوبر 2020 - 23:19,هسبريس من الرباط,"وجه ""بيت الشعر في المغرب"" إلى وزير الثقافة وال...",art-et-culture
f1cf1b9c054e11ebb718646e69d991ea,1,"مهرجان ""سينما المؤلّف"" يستحضر روح ثريا جبران",الجمعة 02 أكتوبر 2020 - 07:26,هسبريس من الرباط,في ظلّ استمرار حالة الطوارئ الصحية المرتبطة بج...,art-et-culture
f2d282a4054e11eb800f646e69d991ea,2,"فيلم ""بدون عنف"" لهشام العسري ..""كعب الحذاء ووا...",الجمعة 02 أكتوبر 2020 - 04:00,عفيفة الحسينات*,تشير مشاهدة فيلم قصير ضمن الثلاثية الأخيرة للم...,art-et-culture
f3f46cac054e11eba403646e69d991ea,3,"""تنين ووهان"" .. مريم أيت أحمد توقِّع أولى ""روا...",الجمعة 02 أكتوبر 2020 - 02:00,حاورَها: وائل بورشاشن,"مِن قَلب أيّام ""الحَجْر""، رأتِ النّورَ الفصول ...",art-et-culture
f50f0476054e11eba31b646e69d991ea,4,"مسكر يتخلّى عن دعم ""الوزارة"" بسبب ""الجمهور""",الخميس 01 أكتوبر 2020 - 19:40,هسبريس من الرباط,أعلن الفنان المغربيّ سعيد مسكر تخليه عن مبلغ ا...,art-et-culture


In [6]:
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                             """, re.VERBOSE)

def remove_diacritics(text):
    '''this function to remove diacritics'''
    
    text = re.sub(arabic_diacritics, '', text)
    return text

def normalize_arabic(text):
    '''this function to convert some special arabic characters into more general equivalent ones'''
    
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)   # i keep this because this one contains useful feature as specially iraq uses this character
    return text

def remove_punctuations(text):
    '''this function to remove punctuation'''
    
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

# def remove_repeating_char(text):
#     return re.sub(r'(.)\1+', r'\1', text)

def remove_stop_words(text):
    '''this function is to remove stop-words using nltk arabic-stopwords'''
    
    stop_words = list(stopwords.words('arabic'))
    return ' '.join(word for word in text.split() if word not in stop_words)

# alternative method to remove_stop_words
def remove_stop_words(text):
    '''this function is to remove stop-words'''
    
    return " ".join(word for word in text.split() if not stp.is_stop(word))

def clean_text(text):
    # t = re.sub(r'#', ' ', t) # replace '#' with space
    # t = re.sub(r'_', ' ', t)    # replace '_' with space
    text = re.sub(r'[^\u0600-\u06FF]', ' ', text) # remove all characters except arabic ones
    text = remove_diacritics(text) 
    text = remove_punctuations(text) # remove the remained punctuations, actually the remains are only """ ?,; """
    text = normalize_arabic(text)    # replace speacial arabic characters with some how general ones
    text = re.sub(r' +', ' ', text)  # remove multiple spaces, also can done with re.sub(r'/\s\s+/g', ' ', text)
    text = remove_stop_words(text)
    # text = remove_repeating_char(text)
    return text
    

In [7]:
# making preprocessing to remove unwanted symbols
df1["preprocessed_story"] = [clean_text(text) for text in df1["story"]]
df2["preprocessed_story"] = [clean_text(text) for text in df2["story"]]
df3["preprocessed_story"] = [clean_text(text) for text in df3["story"]]
df4["preprocessed_story"] = [clean_text(text) for text in df4["story"]]
df5["preprocessed_story"] = [clean_text(text) for text in df5["story"]]
df6["preprocessed_story"] = [clean_text(text) for text in df6["story"]]
df7["preprocessed_story"] = [clean_text(text) for text in df7["story"]]
df8["preprocessed_story"] = [clean_text(text) for text in df8["story"]]
df9["preprocessed_story"] = [clean_text(text) for text in df9["story"]]
df10["preprocessed_story"] = [clean_text(text) for text in df10["story"]]
df11["preprocessed_story"] = [clean_text(text) for text in df11["story"]]

In [8]:
# concatenate the 11 files
df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11], axis=0, ignore_index=False)
df.index.name="id"

In [9]:
# showing the last 5 rows
df.tail()

Unnamed: 0_level_0,Unnamed: 0,title,date,author,story,topic,preprocessed_story
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
75b0b940055811ebb2ff646e69d991ea,995,"نشطاء أمازيغ يدافعون عن ""الحقّ"" في استقبال إسر...",الثلاثاء 10 شتنبر 2013 - 02:00,هسبريس ـ ميمون أم العيد,دافع ناشطون أمازيغ استقبلوا أخيرا وفدا من الطل...,tamazight,دافع ناشطون امازيغ استقبلوا اخيرا وفدا الطلبه ...
775c3440055811ebbc60646e69d991ea,996,شاعرة أمازيغية تعتصم بالمطار لرفض استمارتها بـ...,الاثنين 09 شتنبر 2013 - 08:20,هسبريس ـ عبد المغيث جبران,خاضت الشاعرة الأمازيغية ملكية مزان اعتصاما لمد...,tamazight,خاضت الشاعره الامازيغيه ملكيه مزان اعتصاما لمد...
78fea9ba055811eb9e32646e69d991ea,997,وفد إسرائيلي يزور المغرب ويلتقي نشطاء أمازيغ ب...,الثلاثاء 03 شتنبر 2013 - 16:24,هسبريس - ماجدة أيت لكتاوي,أدانت المنسقية الوطنية للمبادرة الطلابية ضد ال...,tamazight,ادانت المنسقيه الوطنيه للمبادره الطلابيه ضد ال...
7a58d38c055811ebb9c9646e69d991ea,998,نقاش أمازيغيّ مؤنّث بطنجة يذكّر بكَوْن الحقوق ...,السبت 17 غشت 2013 - 10:30,هسبريس من طنجة,طالبت الناشطة الأمازيغية مريم الدمناتي بضرورة ...,tamazight,طالبت الناشطه الامازيغيه مريم الدمناتي بضروره ...
7b769274055811eba13c646e69d991ea,999,"أمازيغ يقتحمون مقر البرلمان الليبي مطالبين بـ""...",الثلاثاء 13 غشت 2013 - 21:00,محمد الناجم من طرابلس*,اقتحم المئات من المتظاهرين المنحدرين من الأقلي...,tamazight,اقتحم المءات المتظاهرين المنحدرين الاقليات الا...


In [10]:
# splitting each file into 80% training-set and the last 20% for testing-set
df1_train, df1_test = train_test_split(df1, train_size=0.8, test_size=0.2, shuffle=False)
df2_train, df2_test = train_test_split(df2, train_size=0.8, test_size=0.2, shuffle=False)
df3_train, df3_test = train_test_split(df3, train_size=0.8, test_size=0.2, shuffle=False)
df4_train, df4_test = train_test_split(df4, train_size=0.8, test_size=0.2, shuffle=False)
df5_train, df5_test = train_test_split(df5, train_size=0.8, test_size=0.2, shuffle=False)
df6_train, df6_test = train_test_split(df6, train_size=0.8, test_size=0.2, shuffle=False)
df7_train, df7_test = train_test_split(df7, train_size=0.8, test_size=0.2, shuffle=False)
df8_train, df8_test = train_test_split(df8, train_size=0.8, test_size=0.2, shuffle=False)
df9_train, df9_test = train_test_split(df9, train_size=0.8, test_size=0.2, shuffle=False)
df10_train, df10_test = train_test_split(df10, train_size=0.8, test_size=0.2, shuffle=False)
df11_train, df11_test = train_test_split(df11, train_size=0.8, test_size=0.2, shuffle=False)

# Alternative way to get the same result using pandas
# df1_train, df1_test = df1.head(800), df1.tail(200)
# df2_train, df2_test = df2.head(800), df2.tail(200)
# df3_train, df3_test = df3.head(800), df3.tail(200)
# df4_train, df4_test = df4.head(800), df4.tail(200)
# df5_train, df5_test = df5.head(800), df5.tail(200)
# df6_train, df6_test = df6.head(800), df6.tail(200)
# df7_train, df7_test = df7.head(800), df7.tail(200)
# df8_train, df8_test = df8.head(800), df8.tail(200)
# df9_train, df9_test = df9.head(800), df9.tail(200)
# df10_train, df10_test = df10.head(800), df10.tail(200)
# df11_train, df11_test = df11.head(800), df11.tail(200)

In [11]:
# concatenate the 11 training dfs
df_train = pd.concat([df1_train, df2_train, df3_train, df4_train, df5_train, df6_train,
                      df7_train, df8_train, df9_train, df10_train, df11_train], axis=0, ignore_index=False)

# separate the label from the feature_column
y_train = df_train.pop('topic')

In [26]:
# concatenate the 11 training dfs
df_test = pd.concat([df1_test, df2_test, df3_test, df4_test, df5_test, df6_test,
                      df7_test, df8_test, df9_test, df10_test, df11_test], axis=0, ignore_index=False)

# separate the label from the feature_column
y_test = df_test.pop('topic')

#### **i will use only the story column as i expect that it has the most useful information i need to do classification**

In [12]:
# selecting only the preprocessed_topic column
df_train = df_train['preprocessed_story']
# or we may squeeze it, the goal is to convert it into pd.Series
# df_train = df_train.squeeze()

# showing the first 5 rows after selecting of preprocessed_story to make sure that every thing is go as expected
df_train.head()

id
f06aa998054e11eba66e646e69d991ea    وجه بيت الشعر المغرب الي وزير الثقافه والشباب ...
f1cf1b9c054e11ebb718646e69d991ea    استمرار حاله الطوارء الصحيه المرتبطه بجاءحه كو...
f2d282a4054e11eb800f646e69d991ea    تشير مشاهده فيلم قصير الثلاثيه الاخيره للمخرج ...
f3f46cac054e11eba403646e69d991ea    قلب ايام الحجر رات النور الفصول الاولي روايه م...
f50f0476054e11eba31b646e69d991ea    اعلن الفنان المغربي سعيد مسكر تخليه مبلغ الدعم...
                                                          ...                        
944a0bb4055711ebb1b9646e69d991ea    طالبت العصبه الامازيغيه لحقوق الانسان الحكومه ...
953dd0ac055711ebb532646e69d991ea    اطار تخليد جمعيه تامزغا بمدريد لاحتفالات السنه...
96224fa4055711eb90c1646e69d991ea    الوقت تنظر الحركه الامازيغيه الرضا الي موقف حز...
96f78a2e055711eb969b646e69d991ea    قريه ملاعب الصغيره كلم الرشيديه حجمها الكبيره ...
97edc77a055711ebb043646e69d991ea    طالب خالد الزراري رءيس الكونغرس العالمي الاماز...
Name: preprocessed_story, Length: 8800, dtype: obje

- **NOTE:- the used features is my implementation of a paper published that get the SOTA in classifying tweets into it's arabic dialectic**

In [13]:
# initializing support vector machine model
# svm = NuSVC(random_state=seed, 
#             # kernel="linear"
#            )

svm = SVC(random_state=seed)

# creating pipeline of the needed feature 
pipeline = Pipeline([
                     ('cw26', FeatureUnion([
                                            ('word_features', Pipeline([
                                                                        ('ngram_w', CountVectorizer(ngram_range=(2, 3),
                                                                                                   analyzer='word')),
                                                                        ('tfidf_w', TfidfTransformer())
                                                                        ])),
                                            ('char_features', Pipeline([
                                                                        ('ngram_c', CountVectorizer(ngram_range=(2, 3), 
                                                                                                   analyzer='char')),
                                                                        ('tfidf_c', TfidfTransformer())
                                                                        ])),
                                             ])),
                     ('svm', svm)
                    ])

In [14]:
param_grid={"svm__kernel" : ['linear', 'rbf']}

def customized_func(ytrue, ypred):
    '''this customized function is just an example of how to use customized evaluation function'''
    
    return f1_score(ytrue, ypred)

refit = "f1"
scr = refit
scoring={"precision":make_scorer(precision_score), 
         "recall":make_scorer(recall_score),
         "f1":make_scorer(f1_score), 
         "score":make_scorer(accuracy_score), 
         "f1_scored": make_scorer(customized_func)}
         
# model = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, 
model = GridSearchCV(estimator=pipeline, 
                     param_grid=param_grid,                          
                     cv=2, 
                     scoring=scoring, 
                     refit=refit, 
                     n_jobs=1, 
                     return_train_score=True, 
                     verbose=3)

In [15]:
begin_moment = perf_counter()
print(f"begin moment = {begin_moment}")
model.get_params()
model.fit(df_train, y_train)
completion_moment = perf_counter()
elapsed_time = (completion_moment-begin_moment)//60
print(f"completion moment = {completion_moment}")
print(f"elabsed time is {elapsed_time//60} hours: {elapsed_time%60} minuts")

results = pd.DataFrame(model.cv_results_
                      )#.sort_values(by=f"rank_test_{scr}").loc[:,df.columns.str.contains('rank')]

# results[:5].filter(regex='rank | mean | params')

begin moment = 117.2255431
Fitting 2 folds for each of 2 candidates, totalling 4 fits


Traceback (most recent call last):
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1776, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1563, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classifi

[CV 1/2] END svm__kernel=linear; f1: (train=nan, test=nan) f1_scored: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) score: (train=nan, test=nan) total time=11.0min


Traceback (most recent call last):
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1776, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1563, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classifi

[CV 2/2] END svm__kernel=linear; f1: (train=nan, test=nan) f1_scored: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) score: (train=nan, test=nan) total time=11.2min


Traceback (most recent call last):
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1776, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1563, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classifi

[CV 1/2] END svm__kernel=rbf; f1: (train=nan, test=nan) f1_scored: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) score: (train=nan, test=nan) total time=12.8min


Traceback (most recent call last):
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1776, in precision_score
    p, _, _, _ = precision_recall_fscore_support(
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classification.py", line 1563, in precision_recall_fscore_support
    labels = _check_set_wise_labels(y_true, y_pred, average, labels, pos_label)
  File "C:\Users\huzyfa\anaconda3\lib\site-packages\sklearn\metrics\_classifi

[CV 2/2] END svm__kernel=rbf; f1: (train=nan, test=nan) f1_scored: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) score: (train=nan, test=nan) total time=12.0min




completion moment = 5433.1975275
elabsed time is 1.0 hours: 28.0 minuts


IndexError: Boolean index has wrong length: 7 instead of 51

In [16]:
results = pd.DataFrame(model.cv_results_)

In [20]:
filename = './resourses/finalized_model.sav'

# save the model to disk
joblib.dump(model, filename)

# # some time later...

# load the model from disk
loaded_model = joblib.load(filename)

###########################################################################
# alternative saving method
# import pickle
# # # save the model to disk
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))

# # some time later...

# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

In [18]:
results = pd.DataFrame(model.cv_results_
                      ).sort_values(by=f"rank_test_{scr}").loc[:,df.columns.str.contains('rank')]

results[:5].filter(regex='rank | mean | params')

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [19]:
results

NameError: name 'results' is not defined

In [21]:
model

In [22]:
loaded_model.best_score_

nan

In [24]:
best_model = model.best_estimator_
best_model.fit(df_train, y_train)

NameError: name 'df_test' is not defined

In [27]:
pred = best_model.predict(df_test)

In [29]:
sum(pred==y_test)

  sum(pred==y_test)


ValueError: ('Lengths must match to compare', (2200,), (6,))

In [None]:
# retraining the model with the best estimator
best_model = search.best_estimator_
best_model.fit(x, y)
# predicting of the test set data
pred = best_model.predict(df_test)

In [None]:
# presision is the actual correct prediction divided by total prediction made.
# i.e if the model predict that this sample is belongs to class A, what is the 
# probability that this sample is truthfully belongs to class A
# precision = TP/(TP+FP)
precision_score = precision_score(y_true=y_test, y_pred=df_test)

# recall is the percentage of the samples that belongs to class A, but the model didn't predict them belonging to class A
# recall is the number of true positives divided by the total number of true positives and false negatives
recall_score = recall_score(y_true=y_test, y_pred=df_test)

# f1 score is fatorization of precision and recall, with the goal that if one of them is very bad and the other is very good, 
# so f1-score will almost  the worse of them
# f1-score = 2*(precision*recall)/(precision+recall)
f1_score = f1_score(y_true=y_test, y_pred=df_test)

# accuracy is the percentage of the correct predictions to the overall predictions
accuracy_score = accuracy_score(y_true=y_test, y_pred=df_test)