In [1]:
# loading data into pandas df
import pandas as pd

# Loading for data cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

# Loading for stemming & Lemmatizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [2]:
# Read the two source CSV files into separate DataFrames.
data5, data6 = (pd.read_csv(csv_name) for csv_name in ("Data5.csv", "Data6.csv"))

In [3]:
# Keep only the summary text and its label, renamed to the shared column names.
subset5 = data5[['Summary', 'Actual Sentiment']].rename(
    columns={'Summary': 'text', 'Actual Sentiment': 'sentiment'}
)

In [4]:
# Keep only the summary text and its label, renamed to the shared column names.
# Note: the label column is selected as 'Sentiment ' (with a trailing space).
subset6 = data6[['Summary', 'Sentiment ']].rename(
    columns={'Summary': 'text', 'Sentiment ': 'sentiment'}
)

In [5]:
# Stack both labelled frames on top of each other with a fresh integer index.
labelled_frames = [subset5, subset6]
subset = pd.concat(labelled_frames, ignore_index=True)
print(subset.shape)

(4108, 2)


### Sentiment distribution

In [6]:
# Count rows per raw label value, before any label normalisation is applied.
subset['sentiment'].value_counts()

Neutral      1654
Positive     1464
Negative      880
Negative      109
negative        1
Name: sentiment, dtype: int64

### Label consolidation

In [7]:
def label_corr(text):
    """Normalise a raw sentiment label.

    The label is lower-cased and stripped of surrounding whitespace; the
    misspelling 'nuetral' is mapped to 'neutral'. Every other label is
    returned in its normalised form.
    """
    normalised = text.lower().strip()
    return 'neutral' if normalised == 'nuetral' else normalised

In [8]:
# Normalise the labels without stringifying missing values: casting the whole
# column to str would turn NaN into the literal 'nan', which dropna() can no
# longer detect. na_action='ignore' leaves missing labels untouched.
subset['sentiment'] = subset['sentiment'].map(
    lambda label: label_corr(str(label)), na_action='ignore'
)
print(type(subset['sentiment'][1]))
print(subset.shape)
subset['sentiment'].value_counts()

<class 'str'>
(4108, 2)


neutral     1654
positive    1464
negative     990
Name: sentiment, dtype: int64

### Remove NaN

In [9]:
# Discard every row that has a missing value in any column.
subset = subset.dropna()
subset.shape

(4100, 2)

In [10]:
# Rows per label after the rows with missing values were dropped.
subset['sentiment'].value_counts()

neutral     1647
positive    1463
negative     990
Name: sentiment, dtype: int64

In [11]:
# Same distribution expressed as a percentage of all rows.
subset['sentiment'].value_counts(normalize=True) * 100

neutral     40.170732
positive    35.682927
negative    24.146341
Name: sentiment, dtype: float64

In [13]:
subset['sentiment'].value_counts()   # check whether a literal 'nan' label is still present

neutral     1647
positive    1463
negative     990
Name: sentiment, dtype: int64

In [14]:
# Exclude rows whose label is the literal string 'nan'.
# Take an explicit copy: data_final receives new columns later on, and
# assigning to a slice of `subset` would raise SettingWithCopyWarning.
data_final = subset[subset['sentiment'] != 'nan'].copy()
data_final.shape

(4100, 2)

In [15]:
data_final['sentiment'].value_counts()    # label counts of the filtered frame

neutral     1647
positive    1463
negative     990
Name: sentiment, dtype: int64

In [16]:
# Label distribution of the filtered frame as a percentage of all rows.
data_final['sentiment'].value_counts(normalize=True) * 100

neutral     40.170732
positive    35.682927
negative    24.146341
Name: sentiment, dtype: float64

In [17]:
#data_final['count'] = data_final['text'].str.split().apply(len).value_counts()

In [18]:
# Preview the first rows of the filtered frame.
data_final.head()

Unnamed: 0,text,sentiment
0,Sky Automation Test Manager Colin Ramsay is ha...,positive
1,TDC will create two new business units OpCo an...,neutral
2,Client is concerned with our delivery.,negative
3,The delivery timelines are very well met as pe...,positive
4,Client satisfaction from the business and BT (...,negative


### Data Cleaning

In [19]:
# Negation words that must survive cleaning even though they are stop words
# or shorter than three characters.
_NEGATION_WORDS = frozenset(['no', 'not', 'dont', 'isnt'])
_stop_word_cache = {}


def _english_stop_words():
    """Return the NLTK English stop words minus the negation words, built once."""
    if 'english' not in _stop_word_cache:
        _stop_word_cache['english'] = frozenset(stopwords.words('english')) - _NEGATION_WORDS
    return _stop_word_cache['english']


def sentence_to_words(raw_sentence):
    """Reduce a raw sentence to a space-separated string of significant words.

    Processing steps, in order:
      1. strip HTML markup,
      2. delete every hyphen together with the whitespace that follows it,
      3. replace every non-letter character with a space,
      4. lower-case and split on whitespace,
      5. drop English stop words and tokens of one or two characters.

    Negation words ('no', 'not', 'dont', 'isnt') are always kept. Previously
    'no' was removed from the stop list but then discarded by the length
    filter, so it never reached the output.

    Parameters
    ----------
    raw_sentence : str
        Text to clean.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces; empty when
        nothing survives the filters.
    """
    sent_text = BeautifulSoup(raw_sentence.encode('utf-8'), "lxml").get_text()  # Remove HTML markings
    text = re.sub(r'-\s+', '', sent_text)
    letters = re.sub('[^a-zA-Z]|\n', ' ', text)  # Remove non-letters: all special chars, numbers, puncs etc.
    words = letters.lower().split()
    stop_words = _english_stop_words()
    # NOTE(review): the apostrophe of a contraction such as "don't" is turned
    # into a space above, so the 'dont'/'isnt' entries never match that input.
    # Confirm whether contractions should be preserved as negations.
    important_words = [
        w for w in words
        if w in _NEGATION_WORDS or (w not in stop_words and len(w) > 2)
    ]
    return ' '.join(important_words)

In [20]:
# Clean every raw summary and store the result next to the original text.
data_final['clean_text'] = data_final['text'].apply(sentence_to_words)
data_final.head()

Unnamed: 0,text,sentiment,clean_text
0,Sky Automation Test Manager Colin Ramsay is ha...,positive,sky automation test manager colin ramsay happy...
1,TDC will create two new business units OpCo an...,neutral,tdc create two new business units opco netco o...
2,Client is concerned with our delivery.,negative,client concerned delivery
3,The delivery timelines are very well met as pe...,positive,delivery timelines well met per client require...
4,Client satisfaction from the business and BT (...,negative,client satisfaction business stakeholders cont...


### Stemming and Lemmatization

In [21]:
def stemming(text_list):
    """Stem every word in ``text_list`` with the English Snowball stemmer.

    Returns the stemmed words joined by single spaces.
    """
    snowball = SnowballStemmer('english')
    stemmed_words = map(snowball.stem, text_list)
    return ' '.join(stemmed_words)

def lemmatization(text_list):
    """Lemmatize every word in ``text_list`` with the WordNet lemmatizer.

    Returns the lemmatized words joined by single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text_list)
    return ' '.join(lemmas)

In [22]:
# Derive a stemmed and a lemmatized variant of the cleaned text, word by word.
data_final['stem_text'] = data_final['clean_text'].apply(
    lambda cleaned: stemming(cleaned.split())
)
data_final['lemma_text'] = data_final['clean_text'].apply(
    lambda cleaned: lemmatization(cleaned.split())
)

data_final.head()

Unnamed: 0,text,sentiment,clean_text,stem_text,lemma_text
0,Sky Automation Test Manager Colin Ramsay is ha...,positive,sky automation test manager colin ramsay happy...,sky autom test manag colin ramsay happi pleas ...,sky automation test manager colin ramsay happy...
1,TDC will create two new business units OpCo an...,neutral,tdc create two new business units opco netco o...,tdc creat two new busi unit opco netco opco cu...,tdc create two new business unit opco netco op...
2,Client is concerned with our delivery.,negative,client concerned delivery,client concern deliveri,client concerned delivery
3,The delivery timelines are very well met as pe...,positive,delivery timelines well met per client require...,deliveri timelin well met per client requir tr...,delivery timeline well met per client requirem...
4,Client satisfaction from the business and BT (...,negative,client satisfaction business stakeholders cont...,client satisfact busi stakehold continu alert ...,client satisfaction business stakeholder conti...


In [24]:
# Report the row and column count of data_final before duplicates are removed.
print ("The number of records/rows in final df:",data_final.shape[0], "and columns are:",data_final.shape[1])
#print (data_final.columns)

The number of records/rows in final df: 4100 and columns are: 5


In [25]:
# Keep the first row of every group that shares both the stemmed and the
# lemmatized text.
dedup_keys = ["stem_text", "lemma_text"]
data_final = data_final.drop_duplicates(subset=dedup_keys, keep='first')
data_final.shape

(2789, 5)

In [26]:
# Report the row and column count of data_final after duplicates were removed.
print ("The number of records/rows in final df:",data_final.shape[0], "and columns are:",data_final.shape[1])

The number of records/rows in final df: 2789 and columns are: 5


In [27]:
# (rows, columns) of the de-duplicated frame.
data_final.shape

(2789, 5)

In [28]:
# Preview the first rows of the de-duplicated frame.
data_final.head()

Unnamed: 0,text,sentiment,clean_text,stem_text,lemma_text
0,Sky Automation Test Manager Colin Ramsay is ha...,positive,sky automation test manager colin ramsay happy...,sky autom test manag colin ramsay happi pleas ...,sky automation test manager colin ramsay happy...
1,TDC will create two new business units OpCo an...,neutral,tdc create two new business units opco netco o...,tdc creat two new busi unit opco netco opco cu...,tdc create two new business unit opco netco op...
2,Client is concerned with our delivery.,negative,client concerned delivery,client concern deliveri,client concerned delivery
3,The delivery timelines are very well met as pe...,positive,delivery timelines well met per client require...,deliveri timelin well met per client requir tr...,delivery timeline well met per client requirem...
4,Client satisfaction from the business and BT (...,negative,client satisfaction business stakeholders cont...,client satisfact busi stakehold continu alert ...,client satisfaction business stakeholder conti...


In [29]:
# What is the shape of the dataset? (row count and column count of data_final)
print("Input data has {} rows and {} columns".format(len(data_final), len(data_final.columns)))

Input data has 2789 rows and 5 columns


In [32]:
# How many rows carry each sentiment label (neutral / positive / negative)?
print("Out of {} rows, {} are neutral, {} are positive and {} are negative".format(len(data_final),
                                                       len(data_final[data_final['sentiment']=='neutral']),
                                                       len(data_final[data_final['sentiment']=='positive']),
                                                       len(data_final[data_final['sentiment']=='negative'])))

Out of 2789 rows, 1209 are neutral, 1008 are positive and 572 are negative


In [33]:
data_final['sentiment'].value_counts()    # label counts after de-duplication

neutral     1209
positive    1008
negative     572
Name: sentiment, dtype: int64

In [34]:
# How much missing data is there? Counts null values in the label and text columns.
print("Number of null in label: {}".format(data_final['sentiment'].isnull().sum()))
print("Number of null in text: {}".format(data_final['text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


In [None]:
# Number of whitespace-separated tokens in the stemmed text. str.split() with
# no separator yields an empty list for an empty string, so a row whose text
# was cleaned away entirely counts 0 words (split(" ") reported 1 for it).
data_final['SwordCount'] = data_final['stem_text'].astype(str).str.split().str.len()

In [57]:
# Number of whitespace-separated tokens in the lemmatized text. str.split()
# with no separator yields an empty list for an empty string, so a row whose
# text was cleaned away entirely counts 0 words (split(" ") reported 1 for it).
data_final['LwordCount'] = data_final['lemma_text'].astype(str).str.split().str.len()

In [58]:
# Preview the frame with the added word-count column.
data_final.head()

Unnamed: 0,text,sentiment,clean_text,stem_text,lemma_text,LwordCount
0,Sky Automation Test Manager Colin Ramsay is ha...,positive,sky automation test manager colin ramsay happy...,sky autom test manag colin ramsay happi pleas ...,sky automation test manager colin ramsay happy...,11
1,TDC will create two new business units OpCo an...,neutral,tdc create two new business units opco netco o...,tdc creat two new busi unit opco netco opco cu...,tdc create two new business unit opco netco op...,42
2,Client is concerned with our delivery.,negative,client concerned delivery,client concern deliveri,client concerned delivery,3
3,The delivery timelines are very well met as pe...,positive,delivery timelines well met per client require...,deliveri timelin well met per client requir tr...,delivery timeline well met per client requirem...,10
4,Client satisfaction from the business and BT (...,negative,client satisfaction business stakeholders cont...,client satisfact busi stakehold continu alert ...,client satisfaction business stakeholder conti...,16


In [59]:
# Character length (not word count) of the lemmatized text stored under index label 0.
len(data_final['lemma_text'][0])

79

In [63]:
# Lemmatized text stored under index label 2 (a label lookup, not a positional one).
data_final['lemma_text'][2]

'client concerned delivery'