In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import warnings 

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — confirm that a blanket 'ignore' is really intended.
warnings.simplefilter(action='ignore')

In [None]:
# Allow up to 500 columns to be shown before pandas truncates a wide frame.
pd.set_option('display.max_columns', 500)

In [None]:
# Load the test CSV and show its (rows, columns) shape.
test_df = pd.read_csv(r'../input/stress-analysis-in-social-media/dreaddit-test.csv')
test_df.shape

In [None]:
# Load the training CSV and show its (rows, columns) shape.
train_df = pd.read_csv(r'../input/stress-analysis-in-social-media/dreaddit-train.csv')
train_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Keep an independent copy of the training frame; the label-based text cells
# below read from this copy rather than from train_df.
train_data = train_df.copy()

In [None]:
# Count missing values per column.
train_data.isna().sum()

In [None]:
# Number of rows for each value of the 'label' column.
train_data['label'].value_counts()

In [None]:
# Build one lowercase corpus from the text of every row with label == 1.
# Rows are joined with a space: joining on '' glued the last word of one post
# onto the first word of the next, inventing tokens that distort the word cloud.
label_1_para = ' '.join(train_data.loc[train_data['label'] == 1, 'text']).lower()
label_1_para[:1000]

In [None]:
# Build one lowercase corpus from the text of every row with label == 0.
# Rows are joined with a space: joining on '' glued the last word of one post
# onto the first word of the next, inventing tokens that distort the word cloud.
label_0_para = ' '.join(train_data.loc[train_data['label'] == 0, 'text']).lower()
label_0_para[:1000]

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Stop-word set handed to every WordCloud below (built from wordcloud's STOPWORDS).
stopwords = set(STOPWORDS)

# Wordcloud based on Label

In [None]:
# Word cloud of the label == 1 corpus, drawn on an explicit axes without ticks.
wordcloud = WordCloud(stopwords=stopwords, width=1000, height=500).generate(label_1_para)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

In [None]:
# Word cloud of the label == 0 corpus, drawn on an explicit axes without ticks.
wordcloud1 = WordCloud(stopwords=stopwords, width=1000, height=500).generate(label_0_para)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud1)
ax.axis('off')
plt.show()

# Wordcloud Based on Sentiment

In [None]:
# Lowercase corpus of every post whose 'sentiment' value is positive.
# - Reads from train_data (the untouched copy) because a later cell drops the
#   'text' column from train_df, which made this cell fail when re-run.
# - Joins on a space so the last word of one post is not fused with the first
#   word of the next.
sentiment_pos = ' '.join(train_data.loc[train_data['sentiment'] > 0, 'text']).lower()
sentiment_pos[:1000]

In [None]:
# Lowercase corpus of every post whose 'sentiment' value is negative.
# - Reads from train_data (the untouched copy) because a later cell drops the
#   'text' column from train_df, which made this cell fail when re-run.
# - Joins on a space so the last word of one post is not fused with the first
#   word of the next.
sentiment_neg = ' '.join(train_data.loc[train_data['sentiment'] < 0, 'text']).lower()
sentiment_neg[:1000]

In [None]:
# Word cloud of the positive-sentiment corpus, drawn on an explicit axes without ticks.
wordcloud2 = WordCloud(stopwords=stopwords, width=1000, height=500).generate(sentiment_pos)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud2)
ax.axis('off')
plt.show()

In [None]:
# Word cloud of the negative-sentiment corpus, drawn on an explicit axes without ticks.
wordcloud3 = WordCloud(stopwords=stopwords, width=1000, height=500).generate(sentiment_neg)
fig, ax = plt.subplots(figsize=(15, 15))
ax.imshow(wordcloud3)
ax.axis('off')
plt.show()

In [None]:
# Count the rows per subreddit once (the original computed value_counts() twice).
values = train_df['subreddit'].value_counts()
labels = values.index

# Give plotly a tidy two-column frame and refer to its columns by name. The
# original passed train_df as data_frame while names/values were separate
# arrays unrelated to that frame's rows.
subreddit_counts = pd.DataFrame({'subreddit': labels, 'count': values.to_numpy()})

fig = px.pie(subreddit_counts, names='subreddit', values='count')
fig.update_layout(title='Distribution of Subreddits', template='plotly_dark')
fig.update_traces(hovertemplate='%{label}: %{value}')
fig.show()

In [None]:
# Show the first row of the test frame.
test_df.head(1)

In [None]:
# Show the 'text' value stored under index label 0 of the test frame.
test_df['text'][0]

In [None]:
# Show the first row of the training frame.
train_df.head(1)

In [None]:
# Remove the raw text, identifier and timestamp columns from the modelling frame.
# Re-binding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: running it a second time no longer raises
# KeyError for columns that are already gone.
train_df = train_df.drop(
    columns=['text', 'post_id', 'sentence_range', 'id', 'social_timestamp'],
    errors='ignore',
)

In [None]:
# Show the first row of the training frame again to check the remaining columns.
train_df.head(1)

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Normalizer
from sklearn.feature_selection import SelectKBest, chi2, RFE, VarianceThreshold, mutual_info_classif, SelectKBest

In [None]:
# Pre-processing objects created once for later cells.
# `minmax` is the scaler passed to best_model() in the modelling cells.
# NOTE(review): `stdscaler` and `norm` are not used in the cells visible here —
# confirm whether they are needed.
minmax = MinMaxScaler()
stdscaler = StandardScaler()
norm = Normalizer()

In [None]:
# Replace the 'subreddit' column with the integer codes produced by a LabelEncoder.
label_enc = LabelEncoder()
subreddit_codes = label_enc.fit_transform(train_df['subreddit'])
train_df['subreddit'] = subreddit_codes

In [None]:
# Target vector is the 'label' column; the feature matrix is everything else.
y = train_df['label']
X = train_df.drop(columns='label')

# Chi score 

In [None]:
# Keep the `num_feats` columns with the highest chi-squared score against y.
# The features are min-max scaled first; chi2 is documented to expect
# non-negative inputs.
num_feats = 30
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(score_func=chi2, k=num_feats).fit(X_norm, y)
chi_support = chi_selector.get_support()
# Boolean support mask -> list of the selected column names.
chi_feature = X.columns[chi_support].tolist()
X[chi_feature]

# Variance Method

In [None]:
# Fit a variance filter with threshold 0 on the min-max scaled features.
var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_norm)

In [None]:
# Names of the columns that passed the variance filter, then the matching slice of X.
variance_mask = var_thres.get_support()
cols_selected = X.columns[variance_mask].tolist()
X[cols_selected]

# Correlation Method 

Pearson Correlation

In [None]:
def correlation(df, thres):
    """Return the columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``df`` is computed (``DataFrame.corr``
    with its default method) and every column is compared only with the
    columns positioned before it. A column is reported when its absolute
    correlation with at least one earlier column is strictly greater than
    ``thres``, so for each correlated pair the later column is the one flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. Only numeric and boolean columns take part; columns of
        any other dtype are ignored.
    thres : float
        Exclusive cut-off applied to the absolute correlation.

    Returns
    -------
    set
        Labels of the flagged columns (empty when nothing exceeds ``thres``).
    """
    # Restrict to numeric/bool columns up front: recent pandas versions raise
    # inside corr() when a string column is present instead of skipping it.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr()
    strength = corr_matrix.abs().to_numpy()

    # Strictly-lower triangle == all pairs (i, j) with j < i, i.e. "column i
    # versus every earlier column". NaN correlations compare as False.
    earlier = np.tril(np.ones(strength.shape, dtype=bool), k=-1)
    flagged = (earlier & (strength > thres)).any(axis=1)
    return set(corr_matrix.columns[flagged])

In [None]:
# Columns whose absolute correlation with an earlier column exceeds 0.7, and how many there are.
corr_features = correlation(df=X, thres=0.7)
len(corr_features)

In [None]:
# Show the columns flagged as highly correlated.
# corr_features is a set; pandas deprecated set indexers in 1.5 and rejects
# them from 2.0 on, so it is converted to a (sorted, hence stable) list first.
# The bare `corr_features` line was dropped: it was not the cell's last
# expression and therefore displayed nothing.
X[sorted(corr_features)]

# Mutual Info Classifier

In [None]:
# Mutual information between every feature and the label.
# random_state is fixed because the estimator is randomised; without it the
# scores change on every run.
mutual_cls = mutual_info_classif(X, y, random_state=42)

In [None]:
from functools import partial

# Keep the 20 columns with the highest mutual information with the label.
# The score function is seeded through functools.partial so the selected
# columns are the same on every run (an unseeded mutual_info_classif is
# randomised and can change the ranking).
select_20_cols = SelectKBest(partial(mutual_info_classif, random_state=42), k=20)
select_20_cols.fit(X, y)
mutual_cols = X.columns[select_20_cols.get_support()]
mutual_cols

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

In [None]:
# Bar chart of how many rows carry each label value.
y.value_counts().plot.bar()

In [None]:
def best_model(X, y, scaler, algo, test_size=0.25, random_state=42):
    """Fit one classifier on a hold-out split and print its evaluation metrics.

    The data is split into train and test parts, the scaler is fitted on the
    training part only and applied to both, and a freshly constructed
    ``algo()`` is trained and scored.

    Parameters
    ----------
    X : array-like
        Feature matrix (converted with ``np.asarray``).
    y : array-like
        Target vector (converted with ``np.asarray``).
    scaler : object
        Transformer exposing ``fit_transform`` and ``transform``. It is
        re-fitted in place on every call.
    algo : callable
        Zero-argument factory returning an estimator with ``fit``,
        ``predict`` and ``score``.
    test_size : float, default 0.25
        Share of the rows held out for testing.
    random_state : int or None, default 42
        Seed for the split and for estimators that expose an unset
        ``random_state``. A fixed seed scores every model on the same
        hold-out rows, so results are reproducible and comparable.
        Pass ``None`` to get a fresh random split on every call.

    Returns
    -------
    None
        The training score, test accuracy, confusion matrix and
        classification report are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        np.asarray(X),
        np.asarray(y),
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    # Fit the scaler on the training rows only so that no statistics of the
    # test rows leak into the transformation.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = algo()
    # Seed the estimator when it has a random_state that is still unset;
    # estimators without that parameter are left untouched.
    if random_state is not None and hasattr(model, 'get_params'):
        params = model.get_params()
        if 'random_state' in params and params['random_state'] is None:
            model.set_params(random_state=random_state)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    training_score = model.score(X_train, y_train)
    acc = accuracy_score(y_test, predictions)
    con = confusion_matrix(y_test, predictions)
    report = classification_report(y_test, predictions)
    print(f'Training Score: {training_score}')
    print(f'Accuracy Score: {acc}')
    print(f'Confusion Matrix: {con}')
    print(f'Classification Report: {report}')

# Using chi2 features

In [None]:
# Restrict X to the chi2-selected columns and evaluate logistic regression
# on them with the min-max scaler.
X_chi = X[chi_feature]
best_model(X_chi, y, minmax, LogisticRegression)

In [None]:
# k-nearest-neighbours classifier on the chi2-selected columns, min-max scaled.
best_model(X_chi, y, minmax, KNeighborsClassifier)

In [None]:
# Decision tree on the chi2-selected columns, min-max scaled.
best_model(X_chi, y, minmax, DecisionTreeClassifier)

In [None]:
# Random forest on the chi2-selected columns, min-max scaled.
best_model(X_chi, y, minmax, RandomForestClassifier)

In [None]:
# XGBoost classifier on the chi2-selected columns, min-max scaled.
best_model(X_chi, y, minmax, XGBClassifier)

# Variance Features

In [None]:
# Restrict X to the columns kept by the variance filter and evaluate
# logistic regression on them with the min-max scaler.
X_var = X[cols_selected]
best_model(X_var, y, minmax, LogisticRegression)

In [None]:
# k-nearest-neighbours classifier on the variance-filtered columns, min-max scaled.
best_model(X_var, y, minmax, KNeighborsClassifier)

In [None]:
# Decision tree on the variance-filtered columns, min-max scaled.
best_model(X_var, y, minmax, DecisionTreeClassifier)

In [None]:
# Random forest on the variance-filtered columns, min-max scaled.
best_model(X_var, y, minmax, RandomForestClassifier)