### Part 2: Fresh and Clean

In [172]:
import pandas as pd
import numpy as np
from textblob import TextBlob, Word
from langdetect import detect
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import nltk
import sklearn
import glob
from textblob import TextBlob, Word
import langdetect
import unicodedata
import re
import contractions
import spacy
from nltk.corpus import wordnet
import textblob

In [173]:
# Shared NLP resources used by the cleaning helpers below.
nlp = spacy.load('en', parse=False, tag=False, entity=False)
ps = nltk.porter.PorterStemmer()

# Start from NLTK's English stop-word list, then take the negation/contrast
# words out of it so they are no longer part of this list.
stop_words = nltk.corpus.stopwords.words('english')
for kept_word in ('no', 'but', 'not'):
    stop_words.remove(kept_word)

# Captures a single punctuation character so it can be padded with spaces.
# NOTE(review): inside the character class, "(-)" is a range from "(" to ")",
# so the class matches { . ( ) ! } only; a literal hyphen is NOT matched.
# Confirm that this is intended.
special_char_pattern = re.compile(r'([{.(-)!}])')

In [174]:
def importdata(path):
    """Load every ``*.csv`` file found directly in *path* into one DataFrame.

    Args:
        path: Directory containing the CSV files (string).

    Returns:
        A DataFrame holding the rows of all files concatenated with a fresh
        0..n-1 index, without the ``Unnamed: 0`` column (if any file had
        one), and with every row that contains a missing value dropped.

    Raises:
        ValueError: If *path* contains no CSV file.
    """
    # glob returns matches in arbitrary order; sort so that the row order of
    # the result is reproducible from run to run.
    all_files = sorted(glob.glob(path + "/*.csv"))
    if not all_files:
        raise ValueError("No CSV files found in {!r}".format(path))

    frames = [
        pd.read_csv(filename, index_col=None, header=0)
        for filename in all_files
    ]
    frame = pd.concat(frames, axis=0, ignore_index=True)

    # 'Unnamed: 0' is the index column left behind by a previous to_csv();
    # it is only present if the files were saved with their index, so do not
    # fail when it is missing. It must be dropped BEFORE dropna(), otherwise
    # its NaNs (for files lacking the column) would discard valid rows.
    frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')
    return frame.dropna()

In [175]:
def remove_accented_chars(text):
    """Return *text* (coerced to ``str``) with all non-ASCII content removed.

    NFKD normalisation splits an accented letter into its base letter plus a
    combining mark; encoding to ASCII with ``ignore`` then drops the marks and
    every other character that has no ASCII representation.
    """
    decomposed = unicodedata.normalize('NFKD', str(text))
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [176]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: String to clean.
        remove_digits: When true, digits are deleted as well.

    Returns:
        The cleaned string; removed characters are deleted, not replaced.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [177]:
def expand_contractions(text):
    """Return *text* after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

In [178]:
def spacy_lemmatize_text(text):
    """Return *text* with each token replaced by its spaCy lemma.

    Tokens whose lemma is the ``-PRON-`` placeholder keep their original
    surface text. The resulting tokens are joined with single spaces.
    """
    doc = nlp(text)
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [179]:
def tokenize_frase(text):
    """Split *text* into sentences using ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(text)
    return sentences

In [180]:
def tokenize_parola(text):
    """Split *text* into word tokens using ``nltk.word_tokenize``."""
    words = nltk.word_tokenize(text)
    return words

In [181]:
def rilevo_filtro_lingua(text):
    """Detect the language of *text* with ``langdetect.detect``.

    Returns:
        The value returned by ``detect`` (a language code), or the string
        ``"Not_detected"`` when detection raises or returns a falsy value.
    """
    # Run the detector exactly once: it is costly, and because langdetect is
    # not deterministic by default a second call could disagree with the
    # first one that was checked.
    try:
        language = detect(text)
    except Exception:
        # Deliberately best-effort: any failure (empty text, no usable
        # features, non-string input, ...) is reported as "Not_detected".
        return "Not_detected"
    return language or "Not_detected"

In [182]:
def rimuovo_newline(text):
    """Replace each newline, tab and carriage return in *text* with a space."""
    whitespace_to_space = str.maketrans("\n\t\r", "   ")
    return text.translate(whitespace_to_space)

In [183]:
def rimuovo_caratteri_speciali(text):
    """Strip punctuation and digits from *text*.

    Characters matched by ``special_char_pattern`` are first surrounded by
    spaces, then ``remove_special_characters`` is applied with
    ``remove_digits=True``.
    """
    padded = special_char_pattern.sub(" \\1 ", text)
    return remove_special_characters(padded, remove_digits=True)

In [184]:
def stemming_parola(text, stemmer=ps):
    """Stem every whitespace-separated word of *text*.

    Args:
        text: String to process.
        stemmer: Object exposing ``stem(word)``; defaults to the module-level
            ``ps`` stemmer.

    Returns:
        The stemmed words joined with single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [185]:
# Matches a run of word characters ending in a doubled character; the
# substitution keeps the prefix and a single copy of that character, i.e. it
# removes one repeated character per match.
REPEAT_PATTERN = re.compile(r'(\w*)(\w)\2')
MATCH_SUBSTITUTION = r'\1\2'


def remove_repeated_characters(word):
    """Collapse repeated characters in *word* until WordNet recognises it.

    At each step the word is returned as soon as ``wordnet.synsets`` finds
    at least one synset for it; otherwise one repeated character is removed
    and the check is repeated. The loop also stops when no repeated
    character is left to remove.

    Args:
        word: A single token.

    Returns:
        The first intermediate form known to WordNet, or the fully collapsed
        form if none is.
    """
    # Iterative on purpose: every step removes a single character, so a
    # token with a very long repeated run would exceed Python's recursion
    # limit if this were written recursively.
    while not wordnet.synsets(word):
        new_word = REPEAT_PATTERN.sub(MATCH_SUBSTITUTION, word)
        if new_word == word:
            break
        word = new_word
    return word

def duplicate_char_remover(sentence):
    """Lower-case each token of *sentence* and collapse its repeated characters.

    The sentence is split with ``nltk.word_tokenize``; each lower-cased token
    goes through ``remove_repeated_characters`` and the results are joined
    with single spaces.
    """
    cleaned = []
    for token in nltk.word_tokenize(sentence):
        cleaned.append(remove_repeated_characters(token.lower()))
    return ' '.join(cleaned)

In [186]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Return *text* with stop words removed.

    Args:
        text: String to filter.
        is_lower_case: When true, tokens are compared as-is; otherwise each
            token is lower-cased for the comparison only.
        stopwords: Collection of stop words. When missing or empty, NLTK's
            English stop-word list is used.

    Returns:
        The surviving tokens (original casing) joined with single spaces.
    """
    stopwords = stopwords or nltk.corpus.stopwords.words('english')
    words = [word.strip() for word in nltk.word_tokenize(text)]

    kept = []
    for word in words:
        candidate = word if is_lower_case else word.lower()
        if candidate not in stopwords:
            kept.append(word)

    return ' '.join(kept)

### Import data

In [187]:
df_comment= importdata("/Users/simone/Documents/Master_scraping/file_scrpaing/")

In [188]:
df_comment['comment'] = df_comment['comment'].apply(lambda x: x.lower()) 

In [189]:
df_comment['comment'] = df_comment['comment'].apply(lambda x: rimuovo_newline(x)) 

In [190]:
df_comment['no_accent'] = df_comment['comment'].apply(lambda x: remove_accented_chars(x))

In [191]:
df_comment['no_duplicate'] = df_comment['no_accent'].apply(lambda x: duplicate_char_remover(x))

In [192]:
df_comment['no_contracted'] = df_comment['no_duplicate'].apply(lambda x: expand_contractions(x))

In [193]:
df_comment['no_special'] = df_comment['no_contracted'].apply(lambda x: rimuovo_caratteri_speciali(x))

In [194]:
df_comment['language'] = df_comment['no_special'].apply(lambda x: rilevo_filtro_lingua(x))

In [195]:
df_comment = df_comment.loc[df_comment['language'] == 'en']

In [196]:
df_comment['stemming'] = df_comment['no_special'].apply(lambda x: stemming_parola(x))

In [197]:
df_comment['no_stop_words'] = df_comment['stemming'].apply(lambda x: remove_stopwords(x))

In [198]:
df_comment.to_csv("/Users/simone/Documents/Master_scraping/output/preprocessed_comment.csv",sep="\t",header=True,index=False)