In [78]:
###### Pre-Processing Function 
import re
from textblob import Word
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
stop = stopwords.words('english')

def pre_process(name,s=False,l=False):
    #1. replace , ;  with a whitespace
    
    #2. remove . " ' ( ) 
    name = re.sub("[\"\'.()\[\]\{\}<>`\?\!#\$%^\&=@]", " ", name)
    name = re.sub('https?://[A-Za-z0-9./]+','',name)
    
    
    name = re.sub("[,;:\+/\\\*~\|]", " ", name)
    
    
    
    #3. replace multiple spaces with single space
    name = " ".join(name.strip().split())
    
    #4. convert string to lower case
    name = name.lower()
    
    
    #5. Removal of stop words
    name =  " ".join(x for x in name.split() if x not in stop)
    
    #6. Stemming
    if (s):
        name = " ".join([st.stem(name) for name in x.split()])
        
    
    #7. Lemmatization
    if (l):
        name = " ".join([Word(word).lemmatize() for word in name.split()])
        
        
    return name

[nltk_data] Downloading package wordnet to
[nltk_data]     /home1/shubhamg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

def TF_IDF(data,name,char=False):
    
    word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        stop_words='english',
        ngram_range=(1, 2),
        max_features=100000,
        use_idf=False)
    word_vectorizer.fit(data[name])
    
    features_word = word_vectorizer.transform(data[name])
    a = list(word_vectorizer.vocabulary_.keys())
    word_vol = ["W_" + s for s in a]

    if (char):
        char_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            strip_accents='unicode',
            analyzer='char',
            stop_words='english',
            ngram_range=(2, 4),
            max_features=50000,
            use_idf=False)
        char_vectorizer.fit(data[name])
        features_char = char_vectorizer.transform(data[name])
        
        features = hstack([features_word, features_char])
        
        b = list(char_vectorizer.vocabulary_.keys())
        char_vol = ["C_" + s for s in b]
        feature_cols = word_vol + char_vol
    else: 
        features = features_word
        feature_cols = word_vol
        
    return features,word_vectorizer, feature_cols

In [82]:
def Emoticons_Positive(name):
    pos_count = 0
    pos_emoticons = [":-)",":)","(:","(-:","<3",":*",":-D",":D","X-D","XD","xD",
                      ";-)",";)",";-D",";D","(;","(-;"]
    for emoticons in pos_emoticons:
        pos_count+=name.count(emoticons)
    return pos_count

def Emoticons_Negative(name):
    neg_count = 0
    neg_emoticons = [":-(",":(",
                      ":,(",":'(",":(("]
    for emoticons in neg_emoticons:
        neg_count+=name.count(emoticons)
    return neg_count

def negative_comments(name):
    neg_count = 0
    name = name.lower()
    neg_comments = ["no", "not", "doesn't", "does not", "don't"]
    for comments in neg_comments:
        neg_count+=name.count(comments)
    return neg_count


def Punc_help(name):
    help_count = 0
    #punc_help = ["..."]
    help_count=name.count("...")
    help_count=help_count - name.count("....")
        

    return help_count

In [120]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

train_full = pd.read_csv('/data1/shubham.gupta/competitions/Linguipedia/data/train.csv')
test = pd.read_csv('/data1/shubham.gupta/competitions/Linguipedia/data/test.csv')

#train = train_full.iloc[:6000]
#test = train_full.iloc[6000:]

train['tweet_pre'] = train['tweet'].apply(pre_process,l=True)
test['tweet_pre'] = test['tweet'].apply(pre_process,l=True)
frames = [train[['id','tweet_pre']], test[['id','tweet_pre']]]
df = pd.concat(frames)

features,word_vectorizer,feature_cols = TF_IDF(df,'tweet_pre',char=True)
#features,word_vectorizer,feature_cols = TF_IDF(df,'tweet_pre')
features = pd.DataFrame(features.todense())
features.columns = feature_cols

frames_1 = pd.concat([train[['id','tweet','tweet_pre']], test[['id','tweet','tweet_pre']]])
frames_1.index = range(0,len(frames_1))
special_char = ['&',"\$","@","#","\*"]
features["\$&@\*#_Cnt"] = frames_1.tweet.str.count("\$&@\*#")
for spe_chr in special_char:
    col_name = spe_chr + "_Cnt"
    features[col_name] = frames_1.tweet.str.count(spe_chr)
    features[col_name] = features[col_name] - features["\$&@\*#_Cnt"]
    #features[col_name] = np.where(features[col_name]>0,1,0)

features["insta_Cnt"] = frames_1.tweet.str.count("instagram")
features["http_Cnt"] = frames_1.tweet.str.count("http")
features['Post_Cnt'] = frames_1['tweet'].apply(Emoticons_Positive)
features['Neg_Cnt'] = frames_1['tweet'].apply(Emoticons_Negative)
features['Neg_Comment_Cnt'] = frames_1['tweet'].apply(negative_comments)
features['Help_Cnt'] = frames_1['tweet'].apply(Punc_help)
features['word_Cnt'] = frames_1['tweet_pre'].str.count(" ")
#features["insta&***_Cnt"] = (features['insta_Cnt']+ features['http_Cnt'])*features['\$&@\*#_Cnt']
#features["insta&***_Cnt"] = np.where(features["insta&***_Cnt"]>0,1,0)



train_features = features.iloc[:len(train)]
test_features = features.iloc[len(train):]
target = train.label

model = LogisticRegression()
model.fit(train_features, target)
y_pred = model.predict(test_features)

test['label'] = y_pred


In [121]:
from sklearn.metrics import f1_score
f1_score(target, model.predict(train_features), average='weighted') 

0.9430962059274548

In [118]:
y_pred.mean(), target.mean()

(0.28776241679467485, 0.2558080808080808)

In [119]:
test[['id','label']].to_csv('/data1/shubham.gupta/competitions/Linguipedia/submission/06_LR_Emot_BiGram_Abuse_Insta_Neg_Char_SC_http_l1.csv',index=False)