In [9]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/robertadrianbucur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Descriere set de date SMS

Colecția SMS Spam v.1 (denumită în continuare corpus) este un set de mesaje marcate prin SMS care au fost colectate pentru cercetarea SMS Spam.

Conține un set de mesaje SMS în limba engleză de 5.574 de mesaje, conform căruia etichetarea este ham (legitimă) sau spam.

### Exemplu de continut:

0   What you doing?how are you?

0   Ok lar... Joking wif u oni...


0   dun say so early hor... U c already then say...

0   MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*

0   Siva is in hostel aha:-.

0   Cos i was out shopping wif darren jus now n i called him 2 ask wat present he wan lor. Then he started guessing who i was wif n he finally guessed darren lor.

1   FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop

1   Sunshine Quiz! Win a super Sony DVD recorder if you canname the capital of Australia? Text MQUIZ to 82277. B

1   URGENT! Your Mobile No 07808726822 was awarded a L2,000 Bonus Caller Prize on 02/09/03! This is our 2nd attempt to contact YOU! Call 0871-872-9758 BOX95QU

### Problema este una de clasificare binara: mesaj acceptat = 0 si marcate ca spam = 1.

# Incarcarea setului de date

In [50]:
data = pd.read_csv('SMS/SMSSpamCollection.txt', sep='\t', header=None)

data.head()

Unnamed: 0,0,1
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Afisam numarul de sms-uri acceptate si lungimile cuvintelor: minim, maxim, mediu

In [11]:
total_accepted_sms = len(data[data[0]==0])
print('Total accepted sms:', total_accepted_sms)
total_spam_sms = len(data[data[0]==1])
print('Total spam sms:', total_spam_sms)

word_length_data = data.loc[:, 1].apply(lambda x: len(x.split()))
print('Min word length:', word_length_data.min())
print('Max word length:', word_length_data.max())
print('Mean word length:', int((word_length_data.mean())))

Total accepted sms: 4825
Total spam sms: 747
Min word length: 1
Max word length: 171
Mean word length: 15


# Eliminarea stopwords si afisarea numarului de aparitii a ficarui cuvant

In [37]:
countVectorizer = CountVectorizer(stop_words=stopwords.words('english'))
countVectorizer.fit(data.loc[:, 1])
words_frequency = dict(countVectorizer.vocabulary_.items())
data[1] = [text.replace('〨ud', '') for text in data[1]]
data[1] = [text.replace('ú1', '') for text in data[1]]

print(words_frequency)

po19': 5848, '2ez': 448, 'general': 3470, 'jetton': 4252, 'cmon': 2094, 'sticky': 7203, 'replies': 6352, 'lunsford': 4742, 'enjoying': 2912, '0796xxxxxx': 49, 'prizeawaiting': 6026, 'kfc': 4365, 'meals': 4902, 'gravy': 3613, '07008009200': 23, 'attended': 1275, 'mw': 5182, 'tuth': 7861, 'eviction': 2998, 'spiral': 7093, 'michael': 4968, 'riddance': 6424, 'suffers': 7316, 'raglan': 6166, 'edward': 2834, 'cricket': 2328, 'closeby': 2075, 'skye': 6900, 'bookedthe': 1609, 'hut': 3972, 'drastic': 2732, 'indicate': 4083, '3750': 521, 'garments': 3443, 'sez': 6725, 'arab': 1182, 'evry1': 3007, 'eshxxxxxxxxxxx': 2958, 'lay': 4499, 'bimbo': 1529, 'ugo': 7899, '3lions': 532, 'portege': 5904, 'm100': 4758, 'semiobscure': 6681, 'gprs': 3589, 'repeat': 6345, 'loosu': 4680, 'careless': 1866, 'freaking': 3334, 'myspace': 5189, 'logged': 4650, 'method': 4962, 'calculation': 1805, 'blur': 1587, 'clothes': 2081, 'jewelry': 4253, 'breaker': 1678, 'deluxe': 2512, 'features': 3131, 'graphics': 3607, 'bbdel

# Afisarea celui mai frecvent cuvant si afisarea primelor

In [49]:
print('The most frequent word:', [(key, value)  for (key, value) in words_frequency.items() if value == max(words_frequency.values())])

The most frequent word: [('èn', 8572)]


# Afisarea fiecarui sms cu numarul de cuvinte

In [35]:
words_number = data.loc[:, 1].apply(lambda x:len(str(x).split(' ')))
print(np.matrix(list(zip(data[1], words_number))))

[['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
  '20']
 ['Ok lar... Joking wif u oni...' '6']
 ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
  '28']
 ...
 ['Pity, * was in mood for that. So...any other suggestions?' '10']
 ["The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
  '26']
 ['Rofl. Its true to its name' '6']]
