In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
import re
import string
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score,f1_score, classification_report, log_loss
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

In [None]:
# Location of the raw Kaggle CSV, kept in one named constant so it is easy to change.
DATA_PATH = '/kaggle/input/twitter-user-gender-classification/gender-classifier-DFE-791531.csv'

# Load the raw dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Show the summary that DataFrame.info() prints for the raw frame.
df.info()

In [None]:
# Count the missing values in every column of the raw frame.
df.isnull().sum()

In [None]:
# List the distinct raw labels found in the gender column.
df.gender.unique()

In [None]:
# Binary-encode the target: 1 where the label is exactly 'male', 0 otherwise.
# NOTE(review): every other value (including missing labels) is mapped to 0,
# and re-running this cell on the already-encoded column yields all zeros.
df['gender'] = (df['gender'] == 'male').astype('int64')

In [None]:
# Number of non-missing entries in the two columns used for modelling.
df[['gender', 'description']].notna().sum()

In [None]:
# Keep only the target and the free-text profile description.
selected_columns = ['gender', 'description']
data = df[selected_columns]
data.head()

In [None]:
# Missing values per column before any rows are dropped.
data.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
# Missing values per column after the rows with gaps were dropped.
data.isnull().sum()

In [None]:
# First cleaning step: lower-case the description text.
data['description'] = data['description'].str.lower()

In [None]:
# Remove Twitter user handles with a regular expression.
# The first alternative drops a whole handle ("@" followed by word characters);
# the original pattern `[@+]` only deleted the "@" sign and left the user name
# in the text. The second alternative keeps the original behaviour of removing
# any remaining stray "@" or "+" characters.
data['description'] = data['description'].replace(r'@\w+|[@+]', '', regex=True)

In [None]:
# Remove URLs with a regular expression.
# Two fixes compared with the original pattern:
#   * the digit ranges were written with an en dash ("0–9"), which matches only
#     the literal characters "0", "–" and "9"; they are now real "0-9" ranges;
#   * links with a scheme (e.g. "https://t.co/abc") never matched because "/"
#     is not allowed in the host part, so "http(s)://..." is matched explicitly
#     up to the next whitespace.
# The second alternative is the original bare-domain pattern ("example.com/x").
url_pattern = (
    r"https?://\S+"
    r"|(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
)
data['description'] = data['description'].replace(url_pattern, "", regex=True)

In [None]:
# Pull the cleaned descriptions out as an array and inspect one example.
tweets = data['description'].values
tweets[2]

In [None]:
# Split every description into individual terms with NLTK's TweetTokenizer.
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

# One token list per description, in the same order as the rows of `data`.
tweets = list(map(tokenizer.tokenize, data['description']))

In [None]:
# Preview the token lists of the first two descriptions.
tweets[:2]

In [None]:
# Remove punctuation / special-character tokens and English stop words.

stop_words = set(stopwords.words('english'))

# Characters treated as noise. A raw string is used so the backslash stays a
# literal character (the original "\," was an invalid escape sequence).
punctuations = r'''!()-![]{};:+'"\,<>./?@Ÿ‡Œ£º¦¬ÃŠ©¤€«¢œ®°$%^&*_~#Ã°ÂŸÂ“Â±!!! Ã°ÂŸÂ˜Â™Ã°ÂŸÂ˜ÂŽÃ°ÂŸÂ‘Â'''

# Membership is checked per character. The original tested `token in punctuations`,
# which is a substring test on one long string: a token such as "->" or "!!!!"
# was kept only because it does not appear contiguously in the literal.
punctuation_chars = set(punctuations)


def _is_punctuation_token(token):
    """Return True when every character of `token` is a listed noise character."""
    return all(char in punctuation_chars for char in token)


new_tweets = [
    [
        token
        for token in tweet
        if not _is_punctuation_token(token) and token not in stop_words
    ]
    for tweet in tweets
]

In [None]:
# Preview the filtered token lists of the first two descriptions.
new_tweets[:2]

In [None]:
# Join each filtered token list back into one space-separated string and
# store the result as a new column of `data`.
data['new_description'] = [" ".join(tokens) for tokens in new_tweets]

In [None]:
# Preview the frame, which now includes the `new_description` column.
data.head()

In [None]:
# Draw a word cloud built from all cleaned descriptions.
from wordcloud import WordCloud

# A single string holding every cleaned description, separated by spaces.
all_words = ' '.join(data['new_description'])

cloud_builder = WordCloud(width=800, height=500, random_state=21, max_font_size=110)
wordcloud = cloud_builder.generate(all_words)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Features are the cleaned description text; the target is the encoded gender.
X, y = data['new_description'], data['gender']

In [None]:
# Show how many rows fall into each target class.
y.value_counts()

In [None]:
# Split the data: 25% of the rows are held out as the test set (test_size=0.25),
# and random_state=42 seeds the split.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

In [None]:
# Print the shape of each split, one per line, in the order:
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# After the initial preprocessing phase, the text has to be transformed into a
# meaningful vector (or array) of numbers.
# A bag-of-words representation describes the occurrence of words within a document.
# A TF-IDF vectorizer is used for this purpose, limited to 5000 features (max_features).
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

In [None]:
# Fit the vectorizer on the training split and transform that split.
trans_X_train = vectorizer.fit_transform(X_train)
# Transform the test split with the already-fitted vectorizer (it is not refitted).
trans_X_test = vectorizer.transform(X_test)

In [None]:
# Apply several classification models, starting with logistic regression.

# Train on the TF-IDF training matrix and predict the held-out split.
lr = LogisticRegression()
lr.fit(trans_X_train, y_train)
lr_pred = lr.predict(trans_X_test)

print('accuracy score with logistic regression: ', accuracy_score(y_test, lr_pred))
print('-------------------------------------------')
print(classification_report(y_test, lr_pred))

# Side-by-side view of the test text, the true label and the predicted label.
lr_df = pd.DataFrame({
    'description': X_test,
    'actual values': y_test,
    'predicted values': lr_pred,
})

lr_df.head()

In [None]:
# Random forest classifier

# random_state is fixed (the same seed as the train/test split) so the reported
# scores are reproducible; without it the forest is rebuilt from different
# random draws on every run and the metrics change.
rf = RandomForestClassifier(random_state=42)
rf.fit(trans_X_train, y_train)
rf_pred = rf.predict(trans_X_test)

print('accuracy score with random forest classifier: ', accuracy_score(y_test, rf_pred))
print('-------------------------------------------')
print(classification_report(y_test, rf_pred))

# Side-by-side view of the test text, the true label and the predicted label.
rf_df = pd.DataFrame()
rf_df['description'] = X_test
rf_df['actual values'] = y_test
rf_df['predicted values'] = rf_pred

rf_df.head()

In [None]:
# Gaussian Naive Bayes classifier

# The TF-IDF matrices are converted with .toarray() before being passed to GaussianNB.
gnb = GaussianNB()
gnb.fit(trans_X_train.toarray(), y_train)
gnb_pred = gnb.predict(trans_X_test.toarray())

print('accuracy score with Gaussian Naive Bayes classifier: ', accuracy_score(y_test, gnb_pred))
print('-------------------------------------------')
print(classification_report(y_test, gnb_pred))

# Side-by-side view of the test text, the true label and the predicted label.
gnb_df = pd.DataFrame({
    'description': X_test,
    'actual values': y_test,
    'predicted values': gnb_pred,
})

gnb_df.head()

In [None]:
 #lets try lstm into this
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Per the original author's note, the observed text length was 22 tokens,
# so 25 is used as the sequence length.
MAX_SEQ_LEN = 25
DEFAULT_BATCH_SIZE = 128

# NOTE: this rebinds `tokenizer`, which earlier held the NLTK TweetTokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Encode each split as integer sequences and pad them to MAX_SEQ_LEN in one step.
train_vec = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQ_LEN)
test_vec = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQ_LEN)

# Sanity checks: vocabulary size, largest index, and one example round trip.
sample_row = train_vec[1]
print('token count:', len(tokenizer.word_index))
print("token index(max):", train_vec.max())

print('Tweet Before tokenizing:', X_train.values[1])
print('Tweet After tokenizing:', tokenizer.sequences_to_texts([sample_row]))

print('tokenized values sample:', sample_row.tolist())


In [None]:
# lets try lstm into this
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding


model = Sequential()
model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = DEFAULT_BATCH_SIZE, 
                    input_length = MAX_SEQ_LEN))
model.add(LSTM(units=128, return_sequences=True))
#model.add(Dropout(0.02))
model.add(LSTM(units=64))
model.add(Dense(1, activation='softmax'))


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
from keras.callbacks import ModelCheckpoint

# fit the data into model
model.fit(train_vec, y_train, epochs=24, batch_size=32, verbose=2)

In [None]:
print("\n\n****************************\n\n")
#print('Loading Best Model...')
#model.load_weights('./model_1.h5')
predictions = model.predict(test_vec, verbose=1)
print('Validation Loss:', log_loss(y_test, predictions))
print('Test Accuracy', accuracy_score(y_test, predictions))
print('F1 Score:', f1_score(y_test, predictions))
#plot_confusion_matrix(y_test.argmax(axis = 1), predictions.argmax(axis = 1), classes=encoder.classes_)
#plt.show()    

**From the results above, we see that the LSTM and Naive Bayes models performed very poorly, whereas Logistic Regression and Random Forest performed decently, with 70% accuracy.**