In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import random
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
import json

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib.cm as cm
from matplotlib import rcParams
from prettytable import PrettyTable

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.isri import ISRIStemmer
from collections import Counter 
import itertools
import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from joblib import dump, load

In [None]:
# Read the labelled dataset from the Kaggle input directory and display it.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/arabic-classification/arabic_dataset_classifiction.csv/arabic_dataset_classifiction.csv"
)

df

In [None]:
# Number of rows holding at least one missing value, before cleaning.
print(df.isnull().any(axis=1).sum())

# Drop every incomplete row, then repeat the count on the cleaned frame.
clean_df = df.dropna()
print(clean_df.isnull().any(axis=1).sum())

In [None]:
# Per column: does it contain at least one non-missing value?
print(df.notna().any(axis=0))

In [None]:
def remove_hashtag(df, col = 'text'):
    """Delete the literal characters ``# . ] [ ! X R`` from a text column, in place.

    The column is cast to ``str`` first, so non-string cells become their
    string representation before the characters are removed.

    Args:
        df: DataFrame to modify. It is mutated; nothing is returned.
        col: Name of the column to clean. Defaults to ``'text'``.
    """
    # The characters are meant literally. Passing each one as its own regex
    # (as before) makes '.' match every character and '[' an invalid pattern
    # on pandas versions that treat single-character patterns as real regexes,
    # so escape them and remove them all in one pass with a character class.
    unwanted_chars = "[" + re.escape(r'#.][!XR') + "]"
    df[col] = df[col].astype(str).str.replace(unwanted_chars, '', regex=True)
    

        
# Clean the 'text' column of the training frame in place, then display it.
remove_hashtag(clean_df, col='text')
clean_df

# Punctuation treatment

In [None]:
# Arabic-specific punctuation plus ASCII punctuation; every character in the
# combined string is deleted by remove_punctuations().
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Deletion table built once here instead of on every call (the function is
# applied to every row). It reflects punctuations_list as defined above;
# re-run this cell if that string is changed.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuations_list)

def remove_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without Arabic or ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Normalize

In [None]:
def normalize_arabic(text):
    """Map Arabic letter variants onto one canonical letter each.

    The substitutions are applied in order: the alef forms in the first
    pattern become a bare alef, then three single-letter replacements follow.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, canonical in substitutions:
        text = re.sub(pattern, canonical, text)
    return text

# Remove repetitions

In [None]:
def remove_repeating_char(text):
    """Collapse every run of one repeated character into a single occurrence.

    The pattern uses ``.``, so newline characters are never collapsed.

    Args:
        text: The string to process.

    Returns:
        The string with consecutive duplicate characters squeezed.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(lambda match: match.group(1), text)

In [None]:
def processDocument(doc, stemmer):
    """Clean one raw document and return it as a space-separated string of stems.

    Steps, in order: strip URLs and @mentions, turn underscores and newlines
    into spaces, drop Latin letters, commas and digits, unwrap #hashtags,
    remove punctuation, normalize Arabic letter variants, squeeze repeated
    characters, then stem each whitespace-separated word.

    Args:
        doc: The raw text of one document.
        stemmer: Object exposing ``stem(word)``; the notebook passes an
            ``ISRIStemmer`` instance.

    Returns:
        The cleaned text with words joined by single spaces.

    NOTE(review): stemming is now applied per word rather than to the whole
    document, which changes the features. A model trained with the previous
    preprocessing must be retrained before it is used with this function.
    """
    # Remove URLs first: once Latin letters are stripped below, the pattern
    # can no longer recognise "www." / "http(s)://" and this step would be dead.
    doc = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', doc)
    # Replace @username with a space
    doc = re.sub(r'@[^\s]+', ' ', doc)
    doc = re.sub(r'_', ' ', doc)
    doc = re.sub(r'\n', ' ', doc)
    # Drop Latin letters (the class also contains a literal comma) and digits
    doc = re.sub(r'[a-z,A-Z]', '', doc)
    doc = re.sub(r'\d', '', doc)
    # Replace #word with word
    doc = re.sub(r'#([^\s]+)', r'\1', doc)
    # Remove punctuation
    doc = remove_punctuations(doc)
    # Normalize Arabic letter variants
    doc = normalize_arabic(doc)
    # Remove repeated letters
    doc = remove_repeating_char(doc)
    # Stem word by word: the stemmer works on a single token, so handing it
    # the whole document would treat the entire text as one word.
    doc = ' '.join(stemmer.stem(word) for word in doc.split())

    return doc

# A single stemmer instance is shared by every processDocument() call
# (prepareData() further down reads this global as well).
stemmer = ISRIStemmer()
clean_df["text"] = clean_df["text"].apply(processDocument, args=(stemmer,))
clean_df

In [None]:
# Turn each cleaned document into a list of word tokens (runs of \w characters).
tokenizer = RegexpTokenizer(r'\w+')
clean_df["text"] = clean_df["text"].map(tokenizer.tokenize)
clean_df

In [None]:
# NLTK's Arabic stopword list, printed as one space-separated line for inspection.
stopwords_list = stopwords.words('arabic')
listToStr = ' '.join(map(str, stopwords_list))
print(listToStr)

In [None]:
# Drop stopwords from every token list. Membership is tested against a set,
# so each lookup no longer scans the whole stopword list.
# NOTE(review): the tokens were normalized earlier but the stopwords are not,
# so stopwords containing normalized letters may no longer match - confirm.
_stopword_lookup = set(stopwords_list)
clean_df["text"] = clean_df["text"].apply(
    lambda tokens: [token for token in tokens if token not in _stopword_lookup]
)
clean_df

In [None]:
def countPropetries(df):
    """Print corpus statistics for a tokenized frame and return all its words.

    Prints the total number of words, the vocabulary size (distinct words)
    and the length of the longest token list. An empty frame reports a
    maximum length of 0 instead of raising.

    Args:
        df: Object whose ``"text"`` column holds one sequence of tokens per row.

    Returns:
        A flat list of every token, in row order.
    """
    all_words = [word for tokens in df["text"] for word in tokens]
    vocabulary_size = len(set(all_words))
    # default=0 keeps an empty class from crashing the whole cell
    longest_sentence = max((len(tokens) for tokens in df["text"]), default=0)

    print("%s words total, with a vocabulary size of %s" % (len(all_words), vocabulary_size))
    print("Max sentence length is %s" % longest_sentence)
    return all_words

In [None]:
# One frame per class id stored in the "targe" column (0 through 4).
culture_df = clean_df[clean_df["targe"].eq(0)]
diverse_df = clean_df[clean_df["targe"].eq(1)]
economy_df = clean_df[clean_df["targe"].eq(2)]
politic_df = clean_df[clean_df["targe"].eq(3)]
sport_df = clean_df[clean_df["targe"].eq(4)]

# Print the statistics of each class and keep its flat word list for plotting.
print("Culture : ")
culture_words = countPropetries(df=culture_df)

print("\nDiverse : ")
diverse_words = countPropetries(df=diverse_df)

print("\nEconomy : ")
economy_words = countPropetries(df=economy_df)

print("\nPolitics : ")
politic_words = countPropetries(df=politic_df)

print("\nSport : ")
sport_words = countPropetries(df=sport_df)

In [None]:
def plot(all_words, title):
    """Draw a horizontal bar chart of the 25 most frequent words.

    Sets the global ``figure.figsize`` rcParam to 20x10 as a side effect.

    Args:
        all_words: Iterable of words to count.
        title: Title shown above the chart.
    """
    top_words = Counter(all_words).most_common(25)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    # Ten colours sampled evenly from the rainbow colormap.
    # NOTE(review): fewer colours than bars - presumably matplotlib repeats them.
    colors = cm.rainbow(np.linspace(0, 1, 10))
    rcParams['figure.figsize'] = 20, 10

    plt.title(title)
    plt.xlabel('Count')
    plt.ylabel('Words')
    plt.barh(words, counts, color=colors)

In [None]:
# Most frequent words of the culture class.
plot(all_words=culture_words, title='Top words in Culture')

In [None]:
# Most frequent words of the diverse class.
plot(all_words=diverse_words, title='Top words in Diverse')

In [None]:
# Most frequent words of the economy class.
plot(all_words=economy_words, title='Top words in Economy')

In [None]:
# Most frequent words of the politics class.
plot(all_words=politic_words, title='Top words in Politics')

In [None]:
# Most frequent words of the sport class.
plot(all_words=sport_words, title='Top words in Sport')

In [None]:
# Number of documents per class id, to check class balance.
sns.countplot(x="targe", data=clean_df)
plt.show()

# Machine learning model training

In [None]:
# Features are the token lists, labels are the class ids.
X, y = clean_df['text'], clean_df['targe']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2, test_size=0.25
)

In [None]:
# Word-level unigram TF-IDF, capped at 10,000 features.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    max_features=10000,
    strip_accents='unicode',
    sublinear_tf=True,
)

In [None]:
# TF-IDF features feeding an SVC with default settings.
model = Pipeline(steps=[
    ("tfidf", word_vectorizer),  # convert words to numbers using tfidf
    ("clf", SVC()),              # model the text
])

# Each token list is cast to its string form before it reaches the vectorizer.
model.fit(X_train.astype('str'), y_train)

In [None]:
# Persist the fitted pipeline to the working directory.
dump(value=model, filename="svm_model.joblib")

# Model Evaluation

In [None]:
def calculate_results(y_true, y_pred):
    """Compute accuracy plus weighted precision, recall and F1.

    Note the mixed scales: accuracy is multiplied by 100, while the other
    three values are returned exactly as scikit-learn reports them.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred) * 100,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Score the held-out split with the fitted pipeline.
y_pred = model.predict(X_test.astype('str'))
result = calculate_results(y_true=y_test, y_pred=y_pred)
result

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Render a confusion matrix as an annotated heat map on a new 10x6 figure.

    Args:
        cm: 2-D array of integer counts; indexed as ``cm[row, col]`` and
            expected to provide ``.max()`` and ``.shape``.
        classes: Tick labels, used for both axes.
        title: Figure title.
        cmap: Colormap passed to ``plt.imshow``.
    """
    plt.figure(figsize=(10, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black.
    half_of_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > half_of_max else "black"
            plt.text(col, row, format(cell_value, 'd'),
                     fontsize=20,
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label', fontsize=20)
    plt.xlabel('Predicted label', fontsize=20)

# Stored under its own name: assigning the matrix to `cm` would overwrite the
# `matplotlib.cm` alias imported at the top, breaking plot() on any re-run.
conf_matrix = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(conf_matrix, ['culture', 'diverse', 'economy', 'politic', 'sport'])

In [None]:
# Read the UTF-16 encoded tweet dump; the context manager closes the file
# handle, which the bare open() call left open.
with open("/kaggle/input/twitterdata-dev-morocco/arabic.json", encoding='utf-16') as tweets_file:
    data = json.load(tweets_file)

# Keep only the "date" and "content" fields of each record.
df = pd.DataFrame(data, columns=["date", "content"])

df

In [None]:
def prepareData(df):
    """Return a preprocessed copy of ``df`` with ``content`` as token lists.

    Applies the same steps as the training data: character clean-up,
    ``processDocument``, word tokenization and Arabic stopword removal.
    The input frame is left untouched so the raw text stays available to the
    caller (the final cell prints raw tweets from it).

    Args:
        df: DataFrame with a ``content`` column of raw text.

    Returns:
        A new DataFrame whose ``content`` column holds lists of tokens.
    """
    # Work on a copy: mutating the caller's frame replaced the raw tweets
    # with token lists.
    df = df.copy()
    remove_hashtag(df, 'content')
    df["content"] = df['content'].apply(lambda x: processDocument(x, stemmer))
    tokenizer = RegexpTokenizer(r'\w+')
    df["content"] = df["content"].apply(tokenizer.tokenize)
    # A set gives constant-time membership tests instead of a list scan per token.
    stopwords_set = set(stopwords.words('arabic'))
    df["content"] = df["content"].apply(
        lambda tokens: [item for item in tokens if item not in stopwords_set]
    )
    return df

# Run the tweets through the preprocessing used for the training data.
prepared_df = prepareData(df=df)
prepared_df

In [None]:
# Replace the in-memory pipeline with one loaded from a Kaggle input dataset.
# NOTE(review): this path differs from the "svm_model.joblib" written by the
# earlier dump() call - confirm the uploaded file matches the current
# preprocessing and training code.
# NOTE(review): joblib files are pickle-based; only load trusted artifacts.
model = load("../input/svm-model/svm_model.joblib")

In [None]:
# Classify every prepared tweet.
y_pred = model.predict(prepared_df["content"].astype('str'))

ax = sns.histplot(y_pred, discrete=True)
# Pin one tick per class id before labelling. Setting labels alone relied on
# whatever ticks matplotlib picked (hence the old "" padding entry) and can
# attach names to the wrong bars.
ax.set_xticks(range(5))
ax.set_xticklabels(["culture", "diverse", "economy", "politics", "sport"])

In [None]:
# Predictions split by class id, in the order 0, 1, 2, 3, 4.
y_culture, y_diverse, y_economy, y_politics, y_sport = (
    y_pred[y_pred == class_id] for class_id in range(5)
)

In [None]:
# Count and share (in percent) of the predictions that fall into each class.
_class_counts = [len(y_culture), len(y_diverse), len(y_economy), len(y_politics), len(y_sport)]

table = {
    "label" : ["culture", "diverse", "economy", "politics", "sport"],
    "count" : _class_counts,
    "percentage" : [(count / len(y_pred)) * 100 for count in _class_counts],
}

stat = pd.DataFrame(table)
stat

In [None]:
# result[k] lists the row positions of the tweets predicted as class k.
# A stray outer `for i in range(2)` loop used to run this pass twice and
# stored every position two times; a single pass is enough.
result = [[] for _ in range(5)]
for index, label in enumerate(y_pred):
    # Labels outside 0..4 are ignored, as before.
    if label in (0, 1, 2, 3, 4):
        result[int(label)].append(index)

In [None]:
# Print one randomly chosen tweet for each predicted class.
for i in range(5):
    if not result[i]:
        # random.choice() raises IndexError on an empty list.
        print(table["label"][i] + " :\n" + "(no tweets predicted for this class)" + "\n\n")
        continue
    sample = df.iloc[random.choice(result[i])]["content"]
    # "content" may hold a token list when the frame was tokenized in place;
    # join it so the string concatenation below cannot raise TypeError.
    if isinstance(sample, (list, tuple)):
        sample = " ".join(sample)
    else:
        sample = str(sample)
    print(table["label"][i] + " :\n" + sample + "\n\n")