In [18]:
import pandas as pd
import nltk
import re
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

In [19]:
# read the training set from disk into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='dataset-train.csv')

In [20]:
# preview the first five rows of the DataFrame
dataset.head(n=5)

Unnamed: 0,content,programacao orientada a objeto,linguagem de marcacao,banco de dados,linguagem de script
0,"[package com.ifpb.exemplo.dao;, , import com.i...",1,0,0,0
1,"[package com.ifpb.exemplo.modelo;, , import ja...",1,0,0,0
2,"[package com.ifpb.exemplo.visao;, , import jav...",1,0,0,0
3,"[package com.ifpb.exemplo.dao;, , import java....",1,0,0,0
4,"[package com.ifpb.exemplo.dao;, , import java....",1,0,0,0


In [21]:
# display the 'content' column as a Series
dataset.loc[:, 'content']

0       [package com.ifpb.exemplo.dao;, , import com.i...
1       [package com.ifpb.exemplo.modelo;, , import ja...
2       [package com.ifpb.exemplo.visao;, , import jav...
3       [package com.ifpb.exemplo.dao;, , import java....
4       [package com.ifpb.exemplo.dao;, , import java....
5       [package com.ifpb.exemplo.dao;, , import com.i...
6       [package com.ifpb.exemplo.dao;, , import com.i...
7       [package com.ifpb.exemplo.modelo;, , //QuestÃ£...
8       [package com.ifpb.exemplo.modelo;, , public cl...
9       [package com.ifpb.exemplo.modelo;, , import ja...
10      [package com.ifpb.exemplo.visao;, , import com...
11      [package com.ifpb.dao.dao;, , import com.ifpb....
12      [package com.ifpb.dao.dao;, , import com.ifpb....
13      [package com.ifpb.dao.dao;, , import com.ifpb....
14      [package com.ifpb.dao.modelo;, , import java.t...
15      [package com.ifpb.dao.visao;, , import com.ifp...
16      [package com.ifpb.formasgeometricas.modelo;, ,...
17      [packa

In [22]:
# lowercase every entry of the 'content' column
dataset['content'] = dataset.content.str.lower()

In [23]:
# Replace every punctuation character (anything that is neither a word
# character nor whitespace) with a single space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would leave the text untouched.
# The pattern is a raw string so that `\w` / `\s` are not invalid escapes.
dataset['content'] = dataset['content'].str.replace(r'[^\w\s]', ' ', regex=True)

In [24]:
# Delete standalone numbers (runs of digits delimited by word boundaries),
# replacing them with the empty string.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, so `\b\d+\b` would never match.
dataset['content'] = dataset['content'].str.replace(r'\b\d+\b', '', regex=True)

In [25]:
def tokenize(rows_text):
    """Return the NLTK word tokens of the row's 'content' text.

    rows_text: a mapping-like row (e.g. a DataFrame row) with a 'content' entry.
    """
    return nltk.word_tokenize(rows_text['content'])

# add a transient 'tokenized' column holding the token list of each row
dataset['tokenized'] = dataset.apply(tokenize, axis='columns')

In [26]:
def _load_portuguese_stopwords():
    """Return NLTK's Portuguese stop words as a frozenset, loading them only once."""
    cached = getattr(_load_portuguese_stopwords, '_cache', None)
    if cached is None:
        # The corpus is read a single time and kept on the function object, so
        # applying the filter row by row does not reload it for every row.
        cached = frozenset(nltk.corpus.stopwords.words('portuguese'))
        _load_portuguese_stopwords._cache = cached
    return cached


def remove_stopwords_portuguese(row):
    """Return the row's tokens with Portuguese stop words filtered out.

    row: a mapping-like row (e.g. a DataFrame row) whose 'tokenized' entry is
         an iterable of tokens.
    Returns a new list with the remaining tokens in their original order.
    """
    stopwords_portuguese = _load_portuguese_stopwords()
    # set membership keeps the per-token check constant-time
    return [w for w in row['tokenized'] if w not in stopwords_portuguese]

# add a transient 'with_stopwords_removed' column with the filtered tokens of each row
dataset['with_stopwords_removed'] = dataset.apply(remove_stopwords_portuguese, axis='columns')

In [27]:
# Drop repeated tokens within a single row
def remove_words_duplicates_inline(rows_text):
    """Return the row's 'with_stopwords_removed' tokens without repeats.

    The first occurrence of each token is kept and the original order of
    first appearances is preserved.
    """
    seen = set()
    unique_tokens = []
    for token in rows_text['with_stopwords_removed']:
        if token not in seen:
            seen.add(token)
            unique_tokens.append(token)
    return unique_tokens

# add a transient 'duplicates_removed' column with the de-duplicated tokens of each row
dataset['duplicates_removed'] = dataset.apply(remove_words_duplicates_inline, axis='columns')

In [28]:
# display the 'duplicates_removed' column as a Series
dataset.loc[:, 'duplicates_removed']

0       [package, ifpb, exemplo, dao, import, modelo, ...
1       [package, ifpb, exemplo, modelo, import, java,...
2       [package, ifpb, exemplo, visao, import, java, ...
3       [package, ifpb, exemplo, dao, import, java, ut...
4       [package, ifpb, exemplo, dao, import, java, ut...
5       [package, ifpb, exemplo, dao, import, modelo, ...
6       [package, ifpb, exemplo, dao, import, modelo, ...
7       [package, ifpb, exemplo, modelo, questã, deite...
8       [package, ifpb, exemplo, modelo, public, class...
9       [package, ifpb, exemplo, modelo, import, java,...
10      [package, ifpb, exemplo, visao, import, dao, g...
11      [package, ifpb, dao, import, modelo, pessoa, p...
12      [package, ifpb, dao, import, modelo, pessoa, i...
13      [package, ifpb, dao, import, modelo, pessoa, p...
14      [package, ifpb, dao, modelo, import, java, tim...
15      [package, ifpb, dao, visao, import, pessoadao,...
16      [package, ifpb, formasgeometricas, modelo, pub...
17      [packa

In [29]:
def rejoin_words(row):
    """Join the row's 'duplicates_removed' tokens into one space-separated string."""
    tokens = row['duplicates_removed']
    separator = " "
    return separator.join(tokens)

# add a 'processed' column holding the re-joined text of each row
dataset['processed'] = dataset.apply(rejoin_words, axis='columns')

In [30]:
# print the names of the existing columns
print(dataset.columns.tolist())

['content', 'programacao orientada a objeto', 'linguagem de marcacao', 'banco de dados', 'linguagem de script', 'tokenized', 'with_stopwords_removed', 'duplicates_removed', 'processed']


In [31]:
# raw and intermediate columns that are dropped before exporting
cols_to_drop = ['content', 'tokenized', 'with_stopwords_removed', 'duplicates_removed']
dataset.drop(columns=cols_to_drop, inplace=True)

# write the in-memory DataFrame to a CSV file, without the index column
dataset.to_csv(path_or_buf='dataset-processed.csv', index=False)