In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer

In [2]:
# load data
train = pd.read_csv(r"C:\Users\shubham\Desktop\HackerEarth\hm_train.csv")
test = pd.read_csv(r"C:\Users\shubham\Desktop\HackerEarth\hm_test.csv")

In [3]:
#Missing value analysis in training data
train_miss=train.isna().sum()
train_miss/len(train)
#No missing values in the train data

#Missing value analysis in test data
test_miss=test.isna().sum()
test_miss/len(test)
#No missing data in the test data either

hmid                 0.0
reflection_period    0.0
cleaned_hm           0.0
num_sentence         0.0
dtype: float64

In [4]:
# function to clean data
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [5]:
def convert_reflection(per):
    if per=="24h":
        return 24
    else:
        return 3

In [6]:
def convert_pred_cat(cat):
    if cat=="affection":
        return 0
    elif cat=="exercise":
        return 1
    elif cat=="bonding":
        return 2
    elif cat=="achievement":
        return 3
    elif cat=="enjoy_the_moment":
        return 4
    elif cat=="leisure":
        return 5
    elif cat=="nature":
        return 6

In [7]:
#Merging the data for cleaning
test['predicted_category'] = np.nan
merged_data = pd.concat([train, test]).reset_index(drop=True)

In [8]:
#Data Cleaning and Refining
merged_data['cleaned_hm'] = merged_data['cleaned_hm'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))
merged_data['reflection_period']=merged_data['reflection_period'].map(convert_reflection)

In [9]:
merged_data["predicted_category"]=merged_data["predicted_category"].map(convert_pred_cat)

In [10]:
#Creating Count vectorizer
countvec = CountVectorizer(analyzer='word', ngram_range = (1,1), min_df=150, max_features=500)

In [11]:
# create features
bagofwords = countvec.fit_transform(merged_data['cleaned_hm'])

In [12]:
cols=['reflection_period','num_sentence']

In [13]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())

In [14]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]

In [15]:
data_final = pd.concat([merged_data[cols], bow_df], axis=1)

In [16]:
#Splitting the test and train data
train_data_final=data_final[:len(train)]
test_data_final=data_final[len(train):]

In [17]:
target=merged_data['predicted_category'].dropna()

In [18]:
target.head()

0    0.0
1    0.0
2    1.0
3    2.0
4    0.0
Name: predicted_category, dtype: float64

In [19]:
# Applying Logistic regression

reg = linear_model.LogisticRegression()
reg.fit(train_data_final, target)
pred_lr=reg.predict(test_data_final)
print(cross_val_score(reg, train_data_final, target, cv=5, scoring=make_scorer(accuracy_score)))



[0.85356758 0.84874855 0.84482759 0.83884606 0.83293259]


In [20]:
def to_labels(cat):
    if cat==0:
        return "affection"
    elif cat==1:
        return "exercise"
    elif cat==2:
        return "bonding"
    elif cat==3:
        return "achievement"
    elif cat==4:
        return "enjoy_the_moment"
    elif cat==5:
        return "leisure"
    else:
        return "nature"

In [21]:
#Creating the submission files
submission_file = pd.DataFrame({'hmid':test.hmid, 'predicted_category':pred_lr})
submission_file['predicted_category'] = submission_file['predicted_category'].map(lambda x: to_labels(x))
submission_file = submission_file[['hmid', 'predicted_category']]
submission_file.to_csv(r"C:\Users\shubham\Desktop\HackerEarth\submission_file_lr.csv", index=False)

In [23]:
submission_file.head()

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection
