In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the training data and discard the 'location' and 'keyword' columns
# straight away, so every later cell works on the remaining columns only.
train = (
    pd.read_csv('../input/nlp-getting-started/train.csv')
    .drop(columns=['location', 'keyword'])
)
# Not rendered: a notebook shows only the final expression of a cell.
train.head()

# Same treatment for the test data.
test = (
    pd.read_csv('../input/nlp-getting-started/test.csv')
    .drop(columns=['location', 'keyword'])
)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

import emoji
import unicodedata
from nltk.corpus import wordnet
from emoji.unicode_codes import UNICODE_EMOJI
from textblob import TextBlob, Word

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Download the NLTK data packages one after another.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords', 'omw'):
    nltk.download(resource)

import warnings
# Ignore every warning from here on.
warnings.filterwarnings("ignore")

In [None]:
def clearText(text):
    """Normalise a raw tweet into lower-case, punctuation-free text.

    Steps, in order: strip URLs, strip hashtags (marker and word), strip
    @-mentions, lower-case, delete every character that is neither a word
    character nor whitespace, and collapse whitespace runs to single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. If ``text`` is not a string, the error is printed
        and ``text`` is returned unchanged (best effort, no exception).
    """
    try:
        # Remove URLs. Matching up to the next whitespace keeps characters such
        # as '-', '%' or '~' inside a link from leaving a tail behind.
        text = re.sub(r'https?://\S+', '', text)
        # Remove hashtags; \w also covers '_', so 'tag_name' goes as a whole.
        text = re.sub(r'#\w+', '', text)
        # Remove mentions.
        text = re.sub(r'@[A-Za-z0-9._-]+', '', text)
        text = text.lower()
        # Remove punctuation.
        text = re.sub(r"[^\w\s]", "", text)
        # split() without arguments trims the ends and treats newlines and
        # tabs as separators, so this is the only whitespace clean-up needed.
        text = " ".join(text.split())
    except TypeError as e:
        # re.sub raises TypeError for non-string input (e.g. None or NaN).
        print("clearText error - ", e)

    return text

def pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V' or 'N' map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.NOUN`` respectively; any other tag
    gives ``None``.
    """
    # Prefixes are tried in order; the WordNet attribute is resolved only
    # when a prefix matches.
    for prefix, wordnet_attr in (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN')):
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, wordnet_attr)
    return None

# Filled on first use so the stemmer and stop-word set are built once, not per document.
_STEMMING_CACHE = {}


def _stemming_resources():
    """Return the shared (stemmer, stop-word set) pair, building it on first use."""
    if not _STEMMING_CACHE:
        _STEMMING_CACHE['stemmer'] = nltk.stem.SnowballStemmer('english')
        # A frozenset gives constant-time membership tests.
        _STEMMING_CACHE['stop_words'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STEMMING_CACHE['stemmer'], _STEMMING_CACHE['stop_words']


def stemmingText(text):
    """Reduce text to the space-joined stems of its adjectives, verbs and nouns.

    The text is tokenised and POS-tagged. Tokens whose tag ``pos_tagger``
    maps to ``None`` are dropped, English stop words are dropped, and the
    remaining tokens are stemmed with the English Snowball stemmer.

    Stop words are compared with the lower-cased token as written, before
    stemming. The NLTK list holds unstemmed words, so checking stems instead
    lets entries such as "does" (stem "doe") slip through and discards
    unrelated words whose stem equals a stop word, e.g. "willing" (stem
    "will").

    Parameters
    ----------
    text : str
        Text to process, normally already cleaned by ``clearText``.

    Returns
    -------
    str
        The surviving stems joined by single spaces; ``""`` if none survive.
    """
    stemmer, stop_words = _stemming_resources()

    stems = []
    for word, tag in nltk.pos_tag(word_tokenize(text)):
        # Keep adjectives, verbs and nouns only.
        if pos_tagger(tag) is None:
            continue
        # Filter stop words on the surface form, not on the stem.
        if word.lower() in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return " ".join(stems)

In [None]:
def runPreprocessing(text):
    """Clean a raw document with ``clearText``, then stem it with ``stemmingText``.

    The cleaned value is passed through ``str()`` before stemming.
    """
    cleaned = clearText(text)
    return stemmingText(str(cleaned))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Text column of each split.
train_text, test_text = train['text'], test['text']

# runPreprocessing is plugged in as the vectoriser's preprocessing step.
vectorizer = TfidfVectorizer(preprocessor=runPreprocessing)

# Fit on the training text only, then apply the fitted vectoriser to the test text.
x_train = vectorizer.fit_transform(train_text)
x_test = vectorizer.transform(test_text)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.svm import LinearSVC

import imblearn
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV

In [None]:
# imbalanced-learn's Pipeline accepts samplers as steps and applies them only
# while fitting; the plain scikit-learn Pipeline cannot hold SMOTE.
from imblearn.pipeline import Pipeline as ImbPipeline

SEED = 456  # shared by the CV splitter, SMOTE and the classifier for reproducible runs

scaler = Binarizer()

x = x_train
y = train['target']

# Balance the classes with Synthetic Minority Over-sampling. SMOTE is a
# pipeline step rather than a one-off call on the whole training set:
# resampling before cross-validation lets synthetic neighbours of held-out
# rows into the training folds, which inflates the grid-search scores.
smote = SMOTE(random_state=SEED)

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)

# Grid Search CV to find optimal hyperparameters.
# With dual=True (the default in older scikit-learn releases) LinearSVC
# rejects penalty='l1'; GridSearchCV scores such failed fits as NaN and moves
# on, so the l1 candidates select the primal solver explicitly.
c_values = np.logspace(-2, 2, 5)
grid = [
    {"svm__C": c_values, "svm__penalty": ["l2"]},
    {"svm__C": c_values, "svm__penalty": ["l1"], "svm__dual": [False]},
]

# Oversample the TF-IDF features, binarise them, then classify.
model = ImbPipeline([
    ('smote', smote),
    ('scale', scaler),
    ('svm', LinearSVC(random_state=SEED)),
])
svm_cv = GridSearchCV(model, grid, cv=cv, scoring='f1')
svm_cv.fit(x, y)

# Best parameter set, refitted by GridSearchCV on the full training data.
pipeline = svm_cv.best_estimator_

y_pred = pipeline.predict(x_test)

In [None]:
# Re-read the test file and keep only the columns that are not dropped here.
submission = pd.read_csv("../input/nlp-getting-started/test.csv").drop(
    columns=["keyword", "location", "text"]
)

# Predictions as a one-column frame named 'target'.
target_pred = pd.DataFrame(y_pred, columns=["target"])

# Place the predictions next to the remaining test columns (aligned on the index).
submission = pd.concat([submission, target_pred], axis=1)
submission

In [None]:
# Write the submission frame to submission.csv in the current directory, without the index column.
submission.to_csv("submission.csv",index=False)