In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Read the Just Dance comment-analysis CSV from the Kaggle input directory.
csv_path = r'../input/just-dance-on-youtube/Just Dance Dataset/justDanceAnalysis.csv'
df = pd.read_csv(csv_path)
df.shape  # shown as the cell output: (rows, columns)

In [None]:
# Work on a copy so that `df` still holds the frame exactly as it was read.
data = df.copy()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Show the `originalText` entry stored under index label 4.
data['originalText'][4]

In [None]:
cols = ['expandedText', 'sentiment']
del_cols = [i for i in data.columns if i not in cols] # unnecessary columns

In [None]:
data.drop(del_cols, axis=1, inplace=True)

In [None]:
data.head()

In [None]:
# Remove rows whose `expandedText` repeats an earlier row; the first occurrence is kept.
data.drop_duplicates(subset='expandedText', inplace=True, keep='first')

In [None]:
# Row and column counts after de-duplication.
data.shape

In [None]:
# Number of rows per sentiment label.
data['sentiment'].value_counts()

In [None]:
# Donut chart of how many rows carry each sentiment label.
sentiment_counts = data['sentiment'].value_counts()
values = sentiment_counts
labels = sentiment_counts.index

fig = px.pie(data, values=values, names=labels, hole=0.3)
# Hover text reads "<label> : <count>".
fig.update_traces(hovertemplate='%{label} : %{value}')
fig.update_layout(title='Sentiments Distribution', template='plotly_dark')
fig.show()

In [None]:
data = data.reset_index()
data.drop('index', axis=1, inplace=True)

In [None]:
data.head()

In [None]:
# Rows labelled 'Positive'.
pos_df = data[data['sentiment']=='Positive']
# Concatenate the positive texts into one string for the word cloud.
# A space goes between consecutive texts so the last word of one text and the
# first word of the next are not glued together into a single bogus token.
# Non-string entries (e.g. NaN for a missing text) are skipped because
# str.join raises TypeError on them.
pos_para = ' '.join(text for text in pos_df['expandedText'] if isinstance(text, str))

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
stopwords = set(STOPWORDS) # set stopwords

In [None]:
pos_wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(pos_para)

In [None]:
# Draw the positive word cloud on a 6x6-inch figure with the axes hidden
# and no padding around the image.
pos_fig, pos_ax = plt.subplots(figsize=(6, 6), facecolor=None)
pos_ax.imshow(pos_wordcloud)
pos_ax.axis("off")
pos_fig.tight_layout(pad=0)

plt.show()

In [None]:
# Rows labelled 'Negative'.
neg_df = data[data['sentiment']=='Negative']
# Concatenate the negative texts into one string for the word cloud.
# A space goes between consecutive texts so the last word of one text and the
# first word of the next are not glued together into a single bogus token.
# Non-string entries (e.g. NaN for a missing text) are skipped because
# str.join raises TypeError on them.
neg_para = ' '.join(text for text in neg_df['expandedText'] if isinstance(text, str))

In [None]:
# 800x800 cloud on a white background, built from the negative texts.
neg_cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
neg_wordcloud = WordCloud(**neg_cloud_options).generate(neg_para)

In [None]:
# Draw the negative word cloud on a 6x6-inch figure with the axes hidden
# and no padding around the image.
neg_fig, neg_ax = plt.subplots(figsize=(6, 6), facecolor=None)
neg_ax.imshow(neg_wordcloud)
neg_ax.axis("off")
neg_fig.tight_layout(pad=0)

plt.show()

In [None]:
# Rows labelled 'Neutral'.
neu_df = data[data['sentiment']=='Neutral']
# Concatenate the neutral texts into one string for the word cloud.
# A space goes between consecutive texts so the last word of one text and the
# first word of the next are not glued together into a single bogus token.
# Non-string entries (e.g. NaN for a missing text) are skipped because
# str.join raises TypeError on them.
neu_para = ' '.join(text for text in neu_df['expandedText'] if isinstance(text, str))

In [None]:
# 800x800 cloud on a white background, built from the neutral texts.
neu_cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
neu_wordcloud = WordCloud(**neu_cloud_options).generate(neu_para)

In [None]:
# Draw the neutral word cloud on a 6x6-inch figure with the axes hidden
# and no padding around the image.
neu_fig, neu_ax = plt.subplots(figsize=(6, 6), facecolor=None)
neu_ax.imshow(neu_wordcloud)
neu_ax.axis("off")
neu_fig.tight_layout(pad=0)

plt.show()

In [None]:
from nltk.stem.porter import PorterStemmer
import re

In [None]:
# Build `corpus`: one cleaned, stemmed string per row of data['expandedText'],
# in row order.

# Matches every character that is not an ASCII letter.
non_letters = re.compile('[^a-zA-Z]')

# Created once, outside the loop: the stemmer does not depend on the row.
ps = PorterStemmer()

corpus = []

# Iterate over the values themselves rather than looking rows up by label, so
# the loop does not depend on the index being exactly 0..n-1.
for text in data['expandedText']:
    # A missing text (NaN) is not a string and would make the regex call raise;
    # treat it as an empty document so `corpus` stays aligned with the rows.
    if not isinstance(text, str):
        text = ''

    # replace everything except letters with spaces
    review = non_letters.sub(' ', text)

    # convert all cases to lower cases
    review = review.lower()

    # split into words (default delimiter is whitespace)
    review = review.split()

    # drop stopwords, then reduce each remaining word to its stem
    review = [ps.stem(word) for word in review
                if word not in stopwords]

    # rejoin the stems into a single string
    review = ' '.join(review)

    # collect the cleaned text
    corpus.append(review)

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# TF-IDF vectorizer with all parameters left at their defaults.
tfidf = TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from `corpus` and vectorize it.
# NOTE(review): this is fitted on every document, before any train/test split
# is made, so rows later used for testing also contribute to the fitted
# vocabulary and IDF weights.
X_tt = tfidf.fit_transform(corpus)

In [None]:
# Shape of the vectorized corpus.
X_tt.shape

In [None]:
# Fixed seed so the randomly duplicated minority-class rows, and every result
# computed from them, are the same on each run.
ros = RandomOverSampler(random_state=0)

In [None]:
# Integer code for each row's sentiment label:
# 'Positive' -> 1, 'Negative' -> 0, anything else -> 2.
sentiment_codes = {'Positive': 1, 'Negative': 0}
list_num = [sentiment_codes.get(label, 2) for label in data['sentiment']]

In [None]:
# Attach the integer codes as a new column.
data['sentiment_num'] = list_num

In [None]:
data.head()

In [None]:
# Target vector used for resampling, splitting and evaluation.
y  = data['sentiment_num']

In [None]:
# Split first, then oversample only the training part. Oversampling the full
# dataset would put every row, including the rows later used for testing,
# into the data the model is fitted on, which inflates the reported scores.
# The split arguments match the train_test_split call made before prediction,
# so both calls produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(X_tt, y, random_state=0, shuffle=True)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

In [None]:
# Class counts after oversampling.
y_train_res.value_counts()

In [None]:
# Shape of the oversampled feature matrix.
X_train_res.shape

In [None]:
# The grid search below tries both 'l1' and 'l2' penalties. The default solver
# (lbfgs) does not support 'l1', so every 'l1' candidate fails to fit and is
# scored as NaN. 'saga' supports both penalties for multiclass targets.
# max_iter is raised above the default of 100 to give saga room to converge,
# and random_state makes its data shuffling repeatable.
log = LogisticRegression(solver='saga', max_iter=1000, random_state=0)

In [None]:
# Hyper-parameter grid: five C values (10.2 raised to the powers -2..2)
# combined with both penalty types.
# NOTE(review): base 10.2 looks unusual (10.0 would give a plain log scale);
# confirm it is intended.
grid = {
    "C": np.power(10.2, np.arange(-2, 3)),
    "penalty": ["l1", "l2"],
}
clf = GridSearchCV(estimator=log, param_grid=grid, cv=5, n_jobs=-1, scoring="f1_macro")

In [None]:
clf.fit(X_train_res, y_train_res)

In [None]:
# Shuffled train/test partition of the vectorized corpus with a fixed seed
# (test size left at train_test_split's default).
split_parts = train_test_split(X_tt, y, shuffle=True, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Predict the sentiment code for every test row with the fitted grid search.
pred = clf.predict(X_test)

In [None]:
# Confusion matrix of true (y_test) versus predicted (pred) classes.
confusion_matrix(y_test, pred)

In [None]:
# Fraction of test rows whose predicted class equals the true class.
accuracy_score(y_test, pred)