In [None]:
#importing necessary libraries
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import string
import seaborn
import warnings
warnings.filterwarnings("ignore",category= DeprecationWarning)
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Load the flair-detection CSV from the Kaggle input directory.
data = pd.read_csv(
    '/kaggle/input/reddit-india-flair-detection/datafinal.csv'
)

# Snapshot of the frame exactly as loaded, taken before any cleaning touches `data`.
dataf = data.copy()
data

# Data Cleaning

#### Dropping unwanted columns.

In [None]:
data.columns

In [None]:
# Discard the metadata columns that are not used as text features.
data = data.drop(
    columns=['score', 'url', 'comms_num', 'author', 'timestamp']
)

In [None]:
data.head()

In [None]:
data['title'][0]

In [None]:
data['body'][0]

In [None]:
data['combined_features'][0]

In [None]:
data['comments'][0]

##### Note that the `combined_features` column is just the concatenated text of other columns (title, comments and url), so it carries no additional information.

I am dropping that column too for ease of analysis.

In [None]:
# `combined_features` is dropped as well; show the first rows of the result.
data = data.drop(columns=['combined_features'])
data.head()

In [None]:
data.info()

In [None]:
data.describe()

There are 18 unique flairs.

In [None]:
data['flair'].unique()

Note that some of the flairs are date values. Let's explore it a bit to see what it is...

In [None]:
data.groupby('flair')['title'].describe()

In [None]:
data.groupby('flair')['id'].describe()

In [None]:
data.groupby('flair')['body'].describe()

In [None]:
data.groupby('flair')['comments'].describe()

It seems there are no significant details associated with those 'date-time' flairs, so let's drop those entries.

In [None]:
# Drop the rows whose flair is a date-time stamp (day-month-year hour:minute)
# instead of a real flair label.
regx = re.compile(r"[\d]{1,2}-[\d]{1,2}-[\d]{4} [\d]{1,2}:[\d]{1,2}")

# One vectorised pass over the column instead of re-filtering the frame once per
# row. na=False leaves rows with a missing flair untouched; they are handled in
# a later cell.
is_datetime_flair = data['flair'].str.contains(regx.pattern, regex=True, na=False)

# .copy() gives an independent frame so that later column assignments on `data`
# are not treated as writes to a slice.
data = data[~is_datetime_flair].copy()

In [None]:
data['flair'].unique()

There are some nan values in flair. Exploring it....

In [None]:
# NaN never compares equal to anything (including itself), so `== np.nan`
# always selects zero rows; isna() is the correct missing-value test.
data[data['flair'].isna()].describe()

##### Note: There is no significant data associated with the NaN flair values, so we can drop those rows.

In [None]:
# Remove every row whose flair is missing; missing values in other columns are kept.
data = data.dropna(subset=['flair'])

In [None]:
data.info()

##### Combine [title,body and comments] for text-processing

In [None]:
# Build one free-text field per post from title, body and comments.
# - fillna('') first: astype(str) alone would turn a missing value into the
#   literal word "nan", which then pollutes the vocabulary.
# - join with a space so the last word of one field does not fuse with the
#   first word of the next.
data['text'] = (
    data['title'].fillna('').astype(str) + ' '
    + data['body'].fillna('').astype(str) + ' '
    + data['comments'].fillna('').astype(str)
).str.strip()

In [None]:
# Keep only the label, the post id and the combined text.
# .copy() makes data_final an independent frame: the following cells assign to
# data_final['text'], which would otherwise be a chained assignment on a slice
# of `data`.
data_final = data[['flair', 'id', 'text']].copy()
data_final.head()

In [None]:
data_final.describe()

In [None]:
data_final.groupby('flair')['text'].describe()

##### Removing Special characters

In [None]:
# Replace every character that is not a letter, digit, space, newline or dot
# with a space. regex=True must be explicit: when it is omitted, recent pandas
# versions treat the pattern as a literal string and this line changes nothing.
data_final['text'] = data_final['text'].str.replace("[^a-zA-Z0-9 \n.]", " ", regex=True)

In [None]:
# Now we have clean data!!!
data_final.head(n=10)

# Text Preprocessing

### Removing Punctuation, Stopwords and Tokenization

In [None]:
# Cache for the NLTK English stop-word list; filled on first use by clean_txt.
_ENGLISH_STOPWORDS = None


def clean_txt(mess, stop_words=None):
    """Strip punctuation and stop-words from a text and return its tokens.

    Steps:
        1. Remove every character listed in ``string.punctuation``.
        2. Split the remaining text on whitespace.
        3. Drop the words whose lower-cased form is a stop-word.

    Parameters
    ----------
    mess : str
        The raw text to clean.
    stop_words : container of str, optional
        Lower-case words to discard. When omitted, the NLTK English stop-word
        list is used; it is loaded once and cached as a frozenset, instead of
        being rebuilt and scanned linearly for every single word.

    Returns
    -------
    list of str
        The surviving words, in their original order and original casing.
    """
    global _ENGLISH_STOPWORDS

    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS

    # Delete all punctuation characters in a single pass.
    nonpunc = mess.translate(str.maketrans('', '', string.punctuation))

    return [word for word in nonpunc.split() if word.lower() not in stop_words]

In [None]:
# Turn each document into its list of cleaned tokens.
data_final['text'] = data_final['text'].map(clean_txt)

### Lemmatization

In [None]:
from nltk import WordNetLemmatizer

le = WordNetLemmatizer()

# Lemmatize every token of a document and glue the tokens back into a single
# space-separated string, in one pass over the column.
data_final['text'] = data_final['text'].apply(
    lambda tokens: " ".join(le.lemmatize(token) for token in tokens)
)
data_final['text']

In [None]:
# Keep an independent copy of the processed text column under the name `qwe`
# and display it.
qwe = data_final['text'].copy()
qwe

### Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the bag-of-words vocabulary. clean_txt is passed as the analyzer, so it
# is used to turn each document into tokens, i.e. the punctuation / stop-word
# filtering is applied once more to the already-processed text.
bow_transformer = CountVectorizer(analyzer = clean_txt).fit(data_final['text'])

In [None]:
text_bow = bow_transformer.transform(data_final['text'])

In [None]:
# Summary statistics of the document-term matrix.
n_docs, n_terms = text_bow.shape
print(f'Shape of sparse matrix is {text_bow.shape}')
print(f'Length of dictionary is {len(bow_transformer.vocabulary_)}')
print(f'Number of non-zero occurrences is {text_bow.nnz}')

# nnz / (rows * cols) is the share of NON-zero cells, i.e. the density;
# sparsity is its complement.
density = 100.0 * text_bow.nnz / (n_docs * n_terms)
sparsity = 100.0 - density
print('Density (% of non-zero cells) :', density)
print('Sparsity (% of zero cells) :', sparsity)

### TfIdf

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Learn the IDF weights from the count matrix and re-weight that same matrix.
tfidf_transformer = TfidfTransformer()
text_tfidf = tfidf_transformer.fit_transform(text_bow)

In [None]:
print(f'Shape of tfidf of text is {text_tfidf.shape}')

# Training a Model (Naive Bayes)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
model = MultinomialNB().fit(text_tfidf,data_final['flair'])

In [None]:
# NOTE: these predictions are made on `text_tfidf`, the same matrix the model
# was fitted on, so any metric computed from them measures training-set fit,
# not performance on unseen posts.
predictions = model.predict(text_tfidf)

# Model Evaluation

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(data_final['flair'],predictions))

# Prediction 

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain the three stages: raw text -> token counts -> tf-idf weights -> classifier.
pipeline = Pipeline(
    steps=[
        ('bow', CountVectorizer(analyzer=clean_txt)),
        ('tfidf', TfidfTransformer()),
        ('classifier', MultinomialNB()),
    ]
)

# Train-test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for evaluation. The seed is fixed so that the split,
# and therefore every metric reported below, is identical on each
# Restart & Run All.
text_train, text_test, flair_train, flair_test = train_test_split(
    data_final['text'], data_final['flair'], random_state=42
)

In [None]:
pipeline.fit(text_train,flair_train)

In [None]:
predictions = pipeline.predict(text_test)

In [None]:
print(classification_report(flair_test,predictions))

##### Conclusion: Devised a model which can predict flair if given a text.

### Input 

In [None]:
print(text_test)

### Prediction

In [None]:
# text_test.index holds index *labels* of data_final. Rows were dropped during
# cleaning without resetting the index, so labels no longer equal row
# positions: the lookup must be label-based (.loc), not positional (.iloc).
ids = data_final.loc[text_test.index, 'id'].tolist()
Predicted_df = pd.DataFrame({'ID':ids,'Text':text_test,'PredictedFlair':predictions}).reset_index(drop=True)

In [None]:
Predicted_df