## Arabic News Classification 
## (Entrepreneurship, Science & Technology, Other)

### Import Libraries

In [None]:
#!/usr/bin/python3

import warnings
warnings.filterwarnings('ignore')
import os, pickle, re, string
from string import punctuation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot.metrics as skplt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix

### Loading the datasets

In [None]:
import os

# Walk the Kaggle input directory, printing every file and recording its full
# path. Keeping the joined path (instead of reusing the loop variables after
# the loop) guarantees the directory and file name always belong together,
# even when the last directory visited contains no files.
input_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        input_paths.append(path)

# Fail with a clear message rather than a NameError on undefined loop variables.
if not input_paths:
    raise FileNotFoundError("No input files found under /kaggle/input")

# Load the last file listed and drop every row that has a missing value.
df = pd.read_csv(input_paths[-1])
df = df.dropna()
df

In [None]:
# Count how many rows carry each category label.
df.Category.value_counts()

## Modeling (Training & Testing)

In [None]:
# Map each Arabic category name to an integer class id
# (other -> 3, science & technology -> 2, entrepreneurship -> 1).
Category_lebels = {
    "أخرى": 3,
    "علوم وتكنولوجيا": 2,
    "ريادة أعمال": 1,
}
df = df.replace({"Category": Category_lebels})

# The article text is the model input; the encoded category is the label.
feature, target = df["Content"], df["Category"]

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=100,
)

In [None]:
def draw_conf_matrix(model, X_test, Y_test, labels):
    """Plot the confusion matrix of a fitted classifier on held-out data.

    Args:
        model: Fitted estimator handed to ``plot_confusion_matrix``.
        X_test: Test inputs passed to the model.
        Y_test: True labels for ``X_test``.
        labels: Tick labels for the matrix axes, in the same order as the
            matrix rows/columns. Any iterable is accepted (it is copied into
            a list); ``None`` leaves the choice to scikit-learn.

    Returns:
        The confusion matrix stored on the resulting display object.
    """
    # Create the axes up front so the figure size and title can be set.
    _, ax = plt.subplots(figsize=(10, 6))
    ax.set_title('Confusion Matrix')

    # Dict views and other one-shot iterables are materialised into a list.
    display_labels = None if labels is None else list(labels)

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement on newer versions.
    disp = plot_confusion_matrix(
        model, X_test, Y_test, display_labels=display_labels, ax=ax
    )
    return disp.confusion_matrix

In [None]:
# Pipeline: TF-IDF text features feeding a logistic-regression classifier.
pipe = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Candidate values for the classifier's C parameter; the key prefix is the
# step name make_pipeline derives from the LogisticRegression class.
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}

# Search the grid with 5-fold cross-validation on the training split.
lr = GridSearchCV(pipe, param_grid, cv=5)
lr.fit(X_train, Y_train)

# Predict hard labels and class probabilities for the held-out split.
prediction = lr.predict(X_test)
predicted_proba = lr.predict_proba(X_test)

print(f"Accuracy score is {accuracy_score(Y_test, prediction):.2f}")
print(classification_report(Y_test, prediction))

# The confusion matrix is laid out in sorted class-id order (1, 2, 3), while
# the dict yields its values in insertion order (3, 2, 1). Sorting keeps the
# axis tick labels aligned with the rows/columns they describe.
draw_conf_matrix(lr, X_test, Y_test, sorted(Category_lebels.values()))
skplt.plot_roc_curve(Y_test, predicted_proba)

In [None]:
# Persist the fitted grid-search model to the file 'lr'. The context manager
# closes (and flushes) the file handle even if pickling fails.
with open('lr', 'wb') as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Optional preprocessing step, left disabled:
#df['Content'][3000] = df['Content'].apply(preprocess)[3000]

# Classify the article stored under index label 3000.
sample_text = df['Content'][3000]
out = lr.predict([sample_text])

# Translate the predicted class id back into its Arabic category name.
label_names = list(Category_lebels)
label_codes = list(Category_lebels.values())
label_names[label_codes.index(out[0])]