In [240]:
import pandas as pd
import numpy as np

In [241]:
import os

# Loading the data into a list

In [242]:
# a list of lists to hold the data
data_list=[[] for i in range(len(os.listdir('data')))]
for n,folder in enumerate (os.listdir('data')):
    for file in os.listdir('data/'+folder):
        file_content=open('data/'+folder+'/'+file).read()
        data_list[n].append(file_content)

# How many files are in each category

In [243]:
for n,folder in enumerate(os.listdir('data')):
    print(str(len(data_list[n]))+'   '+folder)
    

219   رياضة
2356   اخبار الشرق الاوسط
1489   اخبار العالم
49   عرض الصحف
296   اقتصاد و اعمال
122   منوعات
232   علوم وتكنولوجيا


In [244]:
#categories
cats=os.listdir('data')

# Cleaning the data, replacing all non-alphanumeric characters with space

In [245]:
import re

In [246]:
def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [247]:
for i in range(len(data_list)):
    for j in range(len(data_list[i])):
        new=' '.join([remove_html_tags(x) for x in data_list[i][j].split()])
        data_list[i][j]=re.sub(r'\W+', ' ', new)

In [248]:
def extract_title(s):
    result = re.search('BBC Arabic(.*)content', s)
    ret=''
    try:
        ret=result.group(1)
    except:
        ret=''
    return ret

In [249]:
d={}
articles=[]
categories=[]
labels=[]
for i in range(len(data_list)):
    for j in range(len(data_list[i])):
        articles.append(data_list[i][j])
        labels.append(i)
        categories.append(cats[i])

In [250]:
df=pd.DataFrame({
    'article':articles,
    'category':categories,
    'label':labels
})

In [251]:
df['title']=df['article'].apply(extract_title)

In [252]:
df['length']=df['article'].apply(lambda x :len(x))

In [254]:
df=pd.DataFrame(df[df['length']>600])

In [255]:
df['category'].value_counts()

اخبار الشرق الاوسط    2279
اخبار العالم          1484
اقتصاد و اعمال         296
علوم وتكنولوجيا        218
رياضة                  217
منوعات                 121
عرض الصحف               49
Name: category, dtype: int64

In [256]:
df['label'].value_counts()

1    2279
2    1484
4     296
6     218
0     217
5     121
3      49
Name: label, dtype: int64

# get rid of عرض الصحف

In [257]:
df=pd.DataFrame(df[df['label']!=3])

In [258]:
df['label'].value_counts()

1    2279
2    1484
4     296
6     218
0     217
5     121
Name: label, dtype: int64

# Down-sample categories 1 and 2 to balance the categories

In [259]:
s1=df[df['label']==1].sample(300)
s2=df[df['label']==2].sample(300)

In [260]:
df=pd.DataFrame(df[(df['label']!=1) &(df['label']!=2)  ])

In [264]:
df=df.append(s1)
df=df.append(s2)

In [268]:
df['label'].value_counts()

2    300
1    300
4    296
6    218
0    217
5    121
Name: label, dtype: int64

# Now the data is balanced, let's train the model

In [272]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

In [274]:
articles=np.array(df['article'])
labels=np.array(df['label'])
x_train,x_test,y_train,y_test=train_test_split(articles,labels,test_size=0.2)

tf_vectorizer=TfidfVectorizer(min_df=4,max_df=0.3,ngram_range=(1,3))
x_train_tfidf=tf_vectorizer.fit_transform(x_train)
x_test_tfidf=tf_vectorizer.transform(x_test)

In [275]:
clf = svm.LinearSVC()
clf.fit(x_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [276]:
clf.score(x_test_tfidf,y_test)


0.9072164948453608

# Accuracy on the test set is 90%

In [277]:
from sklearn.model_selection import cross_validate

In [278]:
scores = cross_validate(clf,x_train_tfidf, y_train, 
                        cv=5, return_train_score=False)

In [281]:
scores['test_score'].mean()

0.9026392859679607

# Accuracy on cross-validation is 90%