In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

### Read data files

In [2]:
# Load both CSV files from the working directory, then stack them row-wise
# into a single frame. Each frame keeps its own row index (no re-indexing).
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)
df_combine = pd.concat(objs=[df_train, df_test], axis=0)


### Data Preprocessing

In [3]:
# Overview of the combined frame. `DataFrame.info()` writes its report straight
# to stdout and returns None, so it is called on its own line rather than
# inside print() (wrapping it printed a stray " None" after the report).
print("Dataframe Information :")
df_combine.info()
print("\nUnique count of Authors: ", df_combine.author.nunique())

# Merge title, body text and author into one string column for vectorizing.
# String concatenation propagates NaN: a row missing any of the three fields
# gets NaN here and is therefore removed by the dropna() below.
df_combine["combine_text_author"] = (
    df_combine["title"] + " - " + df_combine["text"] + " - " + df_combine["author"]
)

# The individual title/author columns are no longer needed once merged.
# Reassignment is used instead of inplace=True; the resulting frame is the same.
df_combine = df_combine.drop(columns=["title", "author"])

# Missing values per column, reported just before incomplete rows are discarded.
print("\n", df_combine.isnull().sum())

# Drop every row that has a NaN in any column. This also removes all rows
# without a label (presumably the rows that came from test.csv), so only
# fully populated, labelled rows remain for training.
df_combine = df_combine.dropna()



Dataframe Information :
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26000 entries, 0 to 5199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      26000 non-null  int64  
 1   title   25320 non-null  object 
 2   author  23540 non-null  object 
 3   text    25954 non-null  object 
 4   label   20800 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 1.2+ MB

 None

Unique count of Authors:  4845

 id                        0
text                     46
label                  5200
combine_text_author    3140
dtype: int64


In [4]:
# Word-level TF-IDF over uni-, bi- and tri-grams, lower-cased, with English
# stop words removed and the vocabulary capped at 500,000 terms.
vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    stop_words="english",
    lowercase=True,
    max_features=500000,
)

# Learn the vocabulary / IDF weights and build the document-term matrix in a
# single pass. The previous version called fit() and then fit_transform() on
# the same column, which tokenized and fitted the whole corpus twice.
tfidf_data = vectorizer.fit_transform(df_combine["combine_text_author"])

# fit() returns the estimator itself, so `tfidf_old` has always been another
# name for `vectorizer`; the alias is kept because the submission cell reads
# `tfidf_old.vocabulary_`.
tfidf_old = vectorizer

# NOTE(review): the vectorizer is fitted on all labelled rows before the
# train/validation split, so validation documents contribute to the IDF
# weights. Validation scores may be slightly optimistic.

### Data split

In [5]:
# Hold out 25% of the labelled rows for validation.
# random_state pins the shuffle so the split (and every metric computed from
# it) is identical on each Restart & Run All; stratify keeps the class ratio
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_data,
    df_combine["label"],
    test_size=0.25,
    random_state=42,
    stratify=df_combine["label"],
)

### Classification Algorithm

In [6]:
# Random forest with default hyper-parameters. random_state fixes the
# bootstrap sampling and feature sub-sampling so that re-running the notebook
# trains the same model and reproduces the same report.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict the held-out rows and print per-class precision / recall / F1.
pred = clf.predict(X_test)

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96      2655
         1.0       0.98      0.92      0.95      1917

    accuracy                           0.96      4572
   macro avg       0.96      0.95      0.96      4572
weighted avg       0.96      0.96      0.96      4572



### Creation of submit.csv

In [7]:
# Build the test-time text exactly the way the training text was built
# (title - text - author). Missing fields are replaced with empty strings
# instead of dropping the row, so every id in test.csv receives a prediction;
# the previous dropna() silently removed rows from the submission.
text_parts = df_test[["title", "text", "author"]].fillna("")
df_test["combine"] = (
    text_parts["title"] + " - " + text_parts["text"] + " - " + text_parts["author"]
)

# Reuse the vectorizer that was fitted on the training corpus and only
# transform() here. The previous version built a second TfidfVectorizer
# (with a different ngram_range) and called fit_transform() on the test rows,
# which re-estimated the IDF weights from the test data and therefore fed the
# classifier features on a different scale from the ones it was trained on.
tf1_new = tfidf_old  # name kept so any later reference still resolves
combine_tfidf = tf1_new.transform(df_test["combine"])

# The classifier was trained on float labels (0.0 / 1.0); cast the
# predictions to int so the CSV contains 0 / 1.
label = clf.predict(combine_tfidf).astype(int)

# One output row per input row, in the original test.csv order.
df_submit = pd.DataFrame({"id": df_test["id"].to_numpy(), "label": label})
assert len(df_submit) == len(df_test), "submission must cover every test id"

df_submit.to_csv("submit.csv", index=False)