In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy
import collections
import string
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

df = pd.read_csv("train.csv");
df = df.drop(['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1)
df['num_chars'] = df["comment_text"].apply(len)
df['words'] = df['comment_text'].apply(lambda x: len(x.split()))
df['prop_words'] = df['words']/df['num_chars']
df['capitals'] = df['comment_text'].apply(lambda x: sum (1 for char in x if char.isupper()))
df['prop_capitals'] = df['capitals']/df['num_chars']
df['prop_caps_vs_words'] = df['capitals']/df['words']
df['paragraphs'] = df['comment_text'].apply(lambda x: x.count('\n'))
df['prop_paragraphs'] = df['paragraphs']/df['num_chars']
df['prop_paragraphs_vs_words'] = df['paragraphs']/df['words']
#nltk.download('stopwords')
stopwords =  set(stopwords.words("english"))
df['num_stopwords'] = df['comment_text'].apply(lambda x: sum(x.count(w) for w in stopwords))
df['prop_stopwords']=df['num_stopwords']/df['num_chars']
df['prop_stopwords_vs_words'] = df['num_stopwords']/df['words']

df['exclamation'] = df['comment_text'].apply(lambda x: x.count("!"))
df['prop_exclamation']=df['exclamation']/df['num_chars']
df['prop_exclamation_vs_words'] = df['exclamation']/df['words']

df['question_marks'] = df['comment_text'].apply(lambda x: x.count("?"))
df['prop_question']=df['question_marks']/df['num_chars']
df['prop_question_vs_words'] = df['question_marks']/df['words']

df['punctuation'] = df['comment_text'].apply(lambda x: sum(x.count(w) for w in string.punctuation))
df['prop_punctuation']=df['punctuation']/df['num_chars']
df['prop_punctuation_vs_words'] = df['punctuation']/df['words']

df['unique_words'] = df['comment_text'].apply(lambda x: len(set(w for w in x.split())))
df['prop_unique']=df['unique_words']/df['num_chars']
df['prop_unique_vs_words'] = df['unique_words']/df['words']

repeated_threshold = 10
def num_repeated(text):
    sptext = text.split()
    word_counts = collections.Counter(sptext)
    return sum(count for word, count in sorted(word_counts.items()) if count>repeated_threshold)

df['repeated_words'] = df['comment_text'].apply(lambda x: num_repeated(x))
df['prop_repeated']=df['repeated_words']/df['num_chars']
df['prop_repeated_vs_words'] = df['repeated_words']/df['words']

df['mentions'] = df['comment_text'].apply(lambda x: x.count("User:"))
df['prop_mentions']=df['mentions']/df['num_chars']
df['prop_mentions_vs_words'] = df['mentions']/df['words']

sid = SentimentIntensityAnalyzer()
polarity_scores = df['comment_text'].apply(lambda x: sid.polarity_scores(x))
print(polarity_scores)
df['sentiment_compound'] = [p['compound'] for p in polarity_scores]
df['sentiment_positive'] = [p['pos'] for p in polarity_scores]
df['sentiment_negative'] = [p['neg'] for p in polarity_scores]
df['sentiment_neutral'] = [p['neu'] for p in polarity_scores]
target = df['toxic']
df = df.drop(columns=['id','toxic','comment_text'])
df.to_csv("train_cleaned.csv")
print(df)
x_train, x_test, y_train, y_test = train_test_split(df, target, test_size=0.25, random_state=0)
model = LogisticRegression(class_weight='balanced')
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
score = model.score(x_test, y_test)
print(score)
print(f1_score(y_test,y_pred))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kjain\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
0         {'neg': 0.0, 'neu': 0.897, 'pos': 0.103, 'comp...
1         {'neg': 0.099, 'neu': 0.743, 'pos': 0.158, 'co...
2         {'neg': 0.083, 'neu': 0.849, 'pos': 0.068, 'co...
3         {'neg': 0.022, 'neu': 0.916, 'pos': 0.062, 'co...
4         {'neg': 0.0, 'neu': 0.663, 'pos': 0.337, 'comp...
5         {'neg': 0.0, 'neu': 0.464, 'pos': 0.536, 'comp...
6         {'neg': 0.531, 'neu': 0.469, 'pos': 0.0, 'comp...
7         {'neg': 0.129, 'neu': 0.773, 'pos': 0.099, 'co...
8         {'neg': 0.109, 'neu': 0.891, 'pos': 0.0, 'comp...
9         {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
10        {'neg': 0.019, 'neu': 0.877, 'pos': 0.104, 'co...
11        {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...
12        {'neg': 0.13, 'neu': 0.673, 'pos': 0.197, 'com...
13        {'neg': 0.12, 'neu': 0.783, 'pos': 0.097, 

0.789311408016444
0.4002854084909026
