In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
import warnings     #To ignore warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including pandas/scikit-learn deprecation and chained-assignment warnings.
warnings.filterwarnings("ignore")
# Load the raw dataset; the codec name is spelled with plain ASCII hyphens.
twitter_data_full=pd.read_csv("Twitter-data.csv",encoding="ISO-8859-1")


## EXPLORATORY DATA ANALYSIS

In [None]:
# Show the column labels of the raw dataset.
twitter_data_full.columns 

In [None]:
# Count rows that repeat an earlier row (DataFrame.duplicated() is called with its defaults).
twitter_data_full.duplicated().sum()

In [None]:
# Print the per-column summary (dtype and non-null count) of the raw dataset.
twitter_data_full.info()      

In [None]:
# Preview the first rows of the raw dataset.
twitter_data_full.head()

In [None]:
# Render floats with thousands separators and two decimals instead of scientific notation.
pd.set_option("display.float_format", '{:20,.2f}'.format)

In [None]:
# Number of distinct values in the _unit_id column.
twitter_data_full["_unit_id"].nunique()

In [None]:
# Summary statistics for the columns of the raw dataset.
twitter_data_full.describe()

In [None]:
# Frequency of each gender label in the raw dataset.
twitter_data_full["gender"].value_counts()

In [None]:
# Keep one frame per gender label ("female" / "male") from the raw dataset.
female_data_full=twitter_data_full[twitter_data_full.gender=="female"]
male_data_full=twitter_data_full[twitter_data_full.gender=="male"]

In [None]:
# Ten most frequent link colours among female profiles (value_counts sorts by frequency).
link_female=female_data_full["link_color"].value_counts().head(10)

In [None]:
# Bar chart of the top link colours for female profiles.
plt.figure(figsize=(10,8))
sb.barplot(y=link_female.to_numpy(),x=link_female.index)
plt.xlabel("Link color")
plt.title("Top 10 Link colors Used By Female")
plt.ylabel("Frequency")

In [None]:
# Ten most frequent link colours among male profiles (value_counts sorts by frequency).
link_male=male_data_full["link_color"].value_counts().head(10)


In [None]:
# Bar chart of the top link colours for male profiles.
plt.figure(figsize=(10,8))
sb.barplot(y=link_male.to_numpy(),x=link_male.index)
plt.xlabel("Link color")
plt.title("Top 10 Link colors Used By Male")
plt.ylabel("Frequency")

In [None]:
# Ten most frequent side-bar colours among female profiles.
sidebar_female=female_data_full["sidebar_color"].value_counts().head(10)


In [None]:
# Bar chart of the top side-bar colours for female profiles.
plt.figure(figsize=(10,8))
sb.barplot(y=sidebar_female.to_numpy(),x=sidebar_female.index)
plt.xlabel("Side-bar color")
plt.title("Top 10 Side-bar colors Used By Female")
plt.ylabel("Frequency")

In [None]:
# Ten most frequent side-bar colours among male profiles.
sidebar_male=male_data_full["sidebar_color"].value_counts().head(10)


In [None]:
# Bar chart of the top side-bar colours for male profiles.
plt.figure(figsize=(10,8))
sb.barplot(y=sidebar_male.to_numpy(),x=sidebar_male.index)
plt.xlabel("Side-bar color")
plt.title("Top 10 Side-bar colors Used By Male")
plt.ylabel("Frequency")

In [None]:
# Keep only the text fields, the label and its confidence score.
twitter_data=twitter_data_full.loc[:,["text","description","gender:confidence","gender"]]

In [None]:
# Preview the reduced frame.
twitter_data.head()

In [None]:
# Number of rows whose gender:confidence equals exactly 1.
full_confidence=twitter_data["gender:confidence"].eq(1).sum()

In [None]:
# Share of rows labelled with full confidence, as a fraction of all rows.
print(f"Fraction of data having 100% gender confidence= {full_confidence/len(twitter_data):.2f}")

In [None]:
# Retain fully-confident rows only; the confidence column is then constant, so drop it.
twitter_data=twitter_data[twitter_data["gender:confidence"].eq(1)].drop(columns="gender:confidence")

In [None]:
# Number of rows left after the confidence filter.
len(twitter_data)

In [None]:
# Frequency of each gender label among the fully-confident rows.
gender_count=twitter_data["gender"].value_counts()

In [None]:
# Bar chart of how many rows carry each gender label.
plt.figure(figsize=(10,8))
sb.barplot(y=gender_count.to_numpy(),x=gender_count.index)
plt.xlabel("Gender")
plt.title("Gender Distribution")
plt.ylabel("Frequency")



In [None]:
# Same label frequencies as a table.
twitter_data["gender"].value_counts()

In [None]:
# Drop rows with gender given as "unknown". .copy() makes the result an independent
# frame, so the column assignments in later cells do not write into a slice.
twitter_data=twitter_data.loc[twitter_data["gender"]!="unknown"].copy()

In [None]:
# Check dtypes and non-null counts after filtering.
twitter_data.info()

In [None]:
# Replace missing descriptions with the literal string "None" (a placeholder word,
# not Python's None) so every row holds a string.
# NOTE(review): the placeholder is later passed through cleaning() like real text — confirm that is intended.
twitter_data["description"]=twitter_data.description.fillna("None")

In [None]:
# Re-check non-null counts to confirm no missing values remain.
twitter_data.info()

In [None]:
import re
import nltk


In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# List of English stopwords. The corpus file id is lowercase ("english");
# a capitalised id fails to resolve on case-sensitive filesystems.
stopwords_list=stopwords.words("english")


In [None]:
# Inspect the stopword list that cleaning() filters against.
stopwords_list

In [None]:
#Function for cleaning "text"
def cleaning(text):
    """Normalise raw text into space-separated lowercase words.

    Steps, in order: strip HTML tags, strip http(s) URLs, turn underscores and
    every non-letter character into spaces, lowercase, tokenize with
    ``word_tokenize`` and drop tokens found in ``stopwords_list``.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. a missing value) is treated as empty.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ""
    text=re.sub(r"<[^>]*>"," ",text) #remove html
    text=re.sub(r"https?://[A-Za-z0-9./]+"," ",text) #remove url
    text=re.sub(r"_+"," ",text) #remove underscore
    # The range must end at "Z": "A-z" is an ASCII range that also covers
    # [ \ ] ^ _ and the backtick, which would then survive this step.
    text=re.sub(r"[^a-zA-Z]"," ",text) #remove symbols and digits
    text=text.lower() #converting to lowercase
    stop_words=set(stopwords_list) #set membership instead of scanning the list per word
    clean_words=[word for word in word_tokenize(text) if word not in stop_words] #removing stopwords
    return " ".join(clean_words) #returns clean text

In [None]:
# Add cleaned versions of the tweet text and the profile description.
twitter_data["clean_text"]=twitter_data["text"].apply(cleaning)
twitter_data["clean_description"]=twitter_data["description"].apply(cleaning)


In [None]:
# Display the frame, now including the clean_text and clean_description columns.
twitter_data

# Which gender makes more typos in their tweets?


In [None]:
#separating the data into male and female data
# .copy() gives each subset its own frame; a Typo_count column is added to both
# later, which should not be written into a slice of twitter_data.
female_data=twitter_data.loc[twitter_data.gender=="female"].copy()
male_data=twitter_data.loc[twitter_data.gender=="male"].copy()

In [None]:
from spellchecker import SpellChecker


spell=SpellChecker() # shared spell-checker instance used by find_typos() to detect unknown words

In [None]:
#function for finding typo count in "text"
def find_typos(text):
    """Count the distinct words in ``text`` that the spell checker does not know.

    HTML tags, http(s) URLs, @mentions and #hashtags are removed first so they
    are not reported as typos; the remaining text is reduced to lowercase
    alphabetic words and passed to ``spell.unknown``.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. a missing value) counts as 0.

    Returns
    -------
    int
        Number of distinct unknown words (a word misspelled twice counts once,
        because ``spell.unknown`` yields each unknown word once).
    """
    if not isinstance(text, str):
        return 0
    text=re.sub(r"<[^>]*>"," ",text) #remove html
    text=re.sub(r"https?://[A-Za-z0-9./]+"," ",text) #remove url
    text=re.sub(r"@[A-Za-z0-9_]+"," ",text) #remove tags, so they are not counted as typos
    text=re.sub(r"#[A-Za-z0-9_]+"," ",text) #remove hashtags
    text=re.sub(r"_+"," ",text) #remove underscore
    # The range must end at "Z": "A-z" is an ASCII range that also covers
    # [ \ ] ^ _ and the backtick, which would otherwise stay glued to words.
    text=re.sub(r"[^a-zA-Z]"," ",text) #remove symbols and digits
    word_list=text.lower().split() #lowercase, then split on whitespace
    # The previous "typo == set()" filter compared strings to a set and never removed anything.
    return len(spell.unknown(word_list)) #returns no. of typos in "text"

In [None]:
# Per-tweet typo count for the female subset, then show the frame.
female_data["Typo_count"]=female_data["text"].apply(find_typos)
female_data

In [None]:
# Per-tweet typo count for the male subset, then show the frame.
male_data["Typo_count"]=male_data["text"].apply(find_typos)
male_data

In [None]:
#comparing typos in male and female
import numpy as np
typo_bins=np.arange(0,8) # shared bin edges 0..7 for both genders
plt.figure(figsize=(10,8))
# Both histograms share one axes; alpha<1 keeps the first one visible where the
# second is drawn over it (opaque bars would hide it).
plt.hist(female_data["Typo_count"],bins=typo_bins,color="yellow",alpha=0.5,label="Female")
plt.hist(male_data["Typo_count"],bins=typo_bins,color="red",alpha=0.5,label="Male")
plt.xlabel("Typo counts")
plt.ylabel("Frequency")
plt.legend()
plt.title("Distribution of Typo counts in both genders")


In [None]:
# Report typo totals per gender and derive the verdict from the numbers
# instead of printing a fixed conclusion.
female_typo_total=female_data.Typo_count.sum()
male_typo_total=male_data.Typo_count.sum()
print("Total number of typos made by female",female_typo_total)
print("Total number of typos made by male",male_typo_total)
# Per-tweet averages give context, since the two subsets need not be the same size.
print("Average typos per tweet by female {:.2f}".format(female_data.Typo_count.mean()))
print("Average typos per tweet by male {:.2f}".format(male_data.Typo_count.mean()))
if female_typo_total>male_typo_total:
    print("Female gender has more typos in their text")
elif male_typo_total>female_typo_total:
    print("Male gender has more typos in their text")
else:
    print("Both genders have the same number of typos in their text")

### The female group has more typos in their tweets

## What are the most common emotions/words used by Males and Females?

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
lem=WordNetLemmatizer() # shared lemmatizer instance, used to reduce words to their root form
from collections import Counter
word_count = Counter() # module-level word -> frequency tally

In [None]:

#function to find word count
def wordcounts(df, top_n=20, lemmatizer=None):
    """Return the most frequent lemmatized words in ``df.clean_text``.

    Every call counts into a fresh Counter, so the result describes ``df`` only
    and calling the function for several frames does not mix their counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``clean_text`` column of whitespace-separated words.
    top_n : int, default 20
        How many of the most common words to return.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``lem`` instance.

    Returns
    -------
    list of (str, int)
        ``(word, count)`` pairs, most frequent first, at most ``top_n`` long.
    """
    from collections import Counter  # local import keeps the function self-contained

    if lemmatizer is None:
        lemmatizer = lem
    counts = Counter()
    for text in df["clean_text"].values:
        # split() without an argument skips empty strings, so an empty
        # clean_text does not add a "" entry to the tally.
        counts.update(lemmatizer.lemmatize(word) for word in str(text).split())
    return counts.most_common(top_n)


In [None]:
# Most common words in the female subset, as returned by wordcounts(); shown as the cell output.
female_word_counts=wordcounts(female_data)
female_word_counts

In [None]:

# Split the (word, count) pairs into parallel lists for plotting. Slicing instead
# of indexing range(20) avoids an IndexError when fewer than 20 pairs are available.
top20_words_female=[pair[0] for pair in female_word_counts[:20]]
top20_count_female=[pair[1] for pair in female_word_counts[:20]]

In [None]:
# Bar chart: most common words in the female subset with their frequencies.
plt.figure(figsize=(15,8))
sb.barplot(y=top20_count_female,x=top20_words_female)
plt.ylabel("Frequency")
plt.title("Most common words used by Female")
plt.xlabel("Words")


In [None]:
# Result of wordcounts() for the male subset; shown as the cell output.
male_word_counts=wordcounts(male_data)
male_word_counts

In [None]:
# Split the (word, count) pairs into parallel lists for plotting. Slicing instead
# of indexing range(20) avoids an IndexError when fewer than 20 pairs are available.
top20_words_male=[pair[0] for pair in male_word_counts[:20]]
top20_count_male=[pair[1] for pair in male_word_counts[:20]]

In [None]:
# Bar chart: most common words in the male subset with their frequencies.
plt.figure(figsize=(15,8))
sb.barplot(y=top20_count_male,x=top20_words_male)
plt.ylabel("Frequency")
plt.title("Most common words used by Male")
plt.xlabel("Words")


In [None]:
from wordcloud import WordCloud, ImageColorGenerator #To generate word cloud


In [None]:
#Generating word clouds
#For female words
# Lemmatize word by word: iterating clean_text yields whole tweets, and passing a
# whole tweet to lemmatize() leaves it unchanged.
total_text_female = " ".join(
    lem.lemmatize(word)
    for tweet in female_data.clean_text
    for word in str(tweet).split()
)
plt.figure(figsize=(10,10))
wordcloud = WordCloud(max_font_size=50,stopwords=None,relative_scaling=1, max_words=200, background_color="white").generate(total_text_female)
plt.title("FEMALE WORD CLOUD")
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
#For male words
# Lemmatize word by word: iterating clean_text yields whole tweets, and passing a
# whole tweet to lemmatize() leaves it unchanged.
total_text_male = " ".join(
    lem.lemmatize(word)
    for tweet in male_data.clean_text
    for word in str(tweet).split()
)
plt.figure(figsize=(10,10))
wordcloud = WordCloud(max_font_size=50,stopwords=None,relative_scaling=1,max_words=200, background_color="white").generate(total_text_male)
plt.title("MALE WORD CLOUD")
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# One combined text field per row: cleaned tweet, a space, then the cleaned description.
twitter_data["Total_text_info"]=twitter_data["clean_text"]+" "+twitter_data["clean_description"]

In [None]:
# Preview the frame with the combined text column.
twitter_data.head()

In [None]:
# Spot-check one combined text value (position 47 in the underlying array, not index label 47).
twitter_data["Total_text_info"].values[47]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer() # vectorizer with default settings; turns text into a token-count matrix


In [None]:
 #Vectorize "Total_text_info"     #independent variable    
# NOTE(review): the vocabulary is learned from every row here, before the
# train/test split made later — confirm that this is acceptable for the evaluation.
X=cv.fit_transform(twitter_data["Total_text_info"])

In [None]:
# Target labels: the "gender" column is the dependent variable.
y=twitter_data["gender"]

In [None]:
from sklearn.model_selection import train_test_split
# Split features and labels into train and test sets; random_state=23 fixes the shuffle.
# No test_size or stratify is passed, so the library defaults apply.
X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=23) #Splitting into train and test sets
from sklearn.metrics import accuracy_score  #For Checking Accuracy

In [None]:
# Sanity-check the sizes of the four split pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Building Models

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

### MultinomialNB

In [None]:
#Checking accuracy for different alpha values
# NOTE(review): each candidate is scored on the test split, so the accuracy of
# the alpha picked from this ranking is an optimistic estimate.
alpha=[0.01,0.05,0.03,0.1,0.3,0.5,0.6,0.7,0.8,0.9,1]
mb_accuracy={}
for smoothing in alpha:
    mb=MultinomialNB(alpha=smoothing,fit_prior=True).fit(X_train,y_train)
    ypred=mb.predict(X_test)
    mb_accuracy[smoothing]=accuracy_score(y_test,ypred)
# Rank the (alpha, accuracy) pairs, best accuracy first.
mb_accuracy=sorted(mb_accuracy.items(),key=lambda pair:pair[1],reverse=True)

In [None]:
# (alpha, accuracy) pairs, best first.
mb_accuracy

In [None]:
#Training with best model
# mb_accuracy is sorted best-first, so its first pair holds the winning alpha;
# reading it from there keeps this cell in sync with the search results.
best_alpha=mb_accuracy[0][0]
mb=MultinomialNB(alpha=best_alpha,fit_prior=True)
mb.fit(X_train,y_train)
ypred=mb.predict(X_test)
mb_accuracy_final=accuracy_score(y_test,ypred)

### Logistic Regression

In [None]:
#Checking accuracy for different c values
# NOTE(review): each candidate is scored on the test split, so the accuracy of
# the C picked from this ranking is an optimistic estimate.
# NOTE(review): multi_class may be deprecated in newer scikit-learn releases — confirm against the installed version.
c=[0.01,0.03,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
lg_accuracy={}
for strength in c:
    lg=LogisticRegression(multi_class="ovr",C=strength,max_iter=200,random_state=23).fit(X_train,y_train)
    ypred=lg.predict(X_test)
    lg_accuracy[strength]=accuracy_score(y_test,ypred)
# Rank the (C, accuracy) pairs, best accuracy first.
lg_accuracy=sorted(lg_accuracy.items(),key=lambda pair:pair[1],reverse=True)

In [None]:
# (C, accuracy) pairs, best first.
lg_accuracy

In [None]:
#Training with best model
# lg_accuracy is sorted best-first, so its first pair holds the winning C;
# reading it from there keeps this cell in sync with the search results.
best_c=lg_accuracy[0][0]
lg=LogisticRegression(multi_class="ovr",C=best_c,max_iter=200,random_state=23)
lg.fit(X_train,y_train)
ypred=lg.predict(X_test)
lg_accuracy_final=accuracy_score(y_test,ypred)

### Support Vector Machine

In [None]:
#Checking Accuracy for different C values
# NOTE(review): each candidate is scored on the test split, so the accuracy of
# the C picked from this ranking is an optimistic estimate.
C=[1,2,3,4,5]
svm_accuracy={}
for penalty in C:
    svc=SVC(C=penalty,random_state=23).fit(X_train,y_train)
    ypred=svc.predict(X_test)
    svm_accuracy[penalty]=accuracy_score(y_test,ypred)
# Rank the (C, accuracy) pairs, best accuracy first.
svm_accuracy=sorted(svm_accuracy.items(),key=lambda pair:pair[1],reverse=True)

In [None]:
# (C, accuracy) pairs, best first.
svm_accuracy

In [None]:
#Training with best model
# svm_accuracy is sorted best-first, so its first pair holds the winning C;
# reading it from there keeps this cell in sync with the search results.
best_svm_c=svm_accuracy[0][0]
svc=SVC(C=best_svm_c,random_state=23)
svc.fit(X_train,y_train)
ypred=svc.predict(X_test)
svm_accuracy_final=accuracy_score(y_test,ypred)

In [None]:
#Comparison of accuracy of the models

In [None]:
import numpy as np

In [None]:
# Bar chart comparing the test accuracy (in percent) of the three final models.
Models=["MultinomialNB","Logistic Regression","Support Vector Machine"]
Accuracy=[score*100 for score in (mb_accuracy_final,lg_accuracy_final,svm_accuracy_final)]
xpos=np.arange(len(Models))
plt.figure(figsize=(10,6))
plt.bar(xpos,Accuracy,color="green")
plt.xticks(xpos,Models)
plt.ylabel("%Accuracy")
plt.xlabel("Model")
plt.title("Comparison Of The Models")


In [None]:
# Print the three accuracies as percentages with two decimals.
print(f" Accuracy for MultinomialNB={Accuracy[0]:.2f}%")
print(f" Accuracy for Logistic Regression={Accuracy[1]:.2f}%")
print(f" Accuracy for Support Vector Machine={Accuracy[2]:.2f}%")

### MultinomialNB model suits best for the given problem

## Building Ensemble Model

In [None]:
from sklearn.ensemble import VotingClassifier
# Majority-vote ("hard") ensemble over the three classifiers built in earlier cells.
ensem_model=VotingClassifier(
    voting='hard',
    estimators=[('logistic reg', lg),("SVM",svc),("multinomial",mb)],
)
ypred=ensem_model.fit(X_train,y_train).predict(X_test)
Ensemble_accuracy=accuracy_score(y_test,ypred)

In [None]:

# Ensemble accuracy on the test split, as a percentage.
print(f"Ensemble Model Accuracy is {Ensemble_accuracy*100:.2f}%")

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Confusion matrix of y_test against ypred (the most recent predictions held in ypred).
print("Confusion Matrix\n",confusion_matrix(y_test,ypred))

In [None]:
# Per-class classification report for y_test against ypred.
print("Classification Report\n",classification_report(y_test,ypred))