# Task for Today  

***

## Twitter User Gender Prediction  

Given *data about users on Twitter*, let's try to predict the **gender** of a given user (labeled as female, male, or brand).  
  
We will use a deep recurrent neural network with multiple inputs to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw gender-classifier CSV; the file is decoded as latin-1.
data = pd.read_csv('../input/twitter-user-gender-classification/gender-classifier-DFE-791531.csv', encoding='latin-1')

In [None]:
# Display the raw DataFrame.
data

In [None]:
# Print column dtypes and non-null counts.
data.info()

# Preprocessing

In [None]:
# Fraction of missing values in each column.
data.isna().mean()

In [None]:
def get_sequences(texts, vocab_length):
    """Tokenize ``texts`` and return them as one padded matrix of token ids.

    Args:
        texts: Iterable of strings to encode.
        vocab_length: Passed to ``Tokenizer`` as ``num_words``.

    Returns:
        The output of ``pad_sequences``: every encoded text padded at the end
        ('post') to the length of the longest encoded text.
    """
    tok = Tokenizer(num_words=vocab_length)
    tok.fit_on_texts(texts)
    encoded = tok.texts_to_sequences(texts)

    # Pad every row up to the longest encoded text.
    longest = max(map(len, encoded))
    return pad_sequences(encoded, maxlen=longest, padding='post')

In [None]:
# Quick check of hex parsing: 'ED' in base 16 is 237.
# Uses the built-in int; the np.int alias was removed in NumPy 1.24.
int('ED', 16)

In [None]:
def hex_to_decimal(x):
    """Convert a hexadecimal string to its integer value.

    Args:
        x: Hexadecimal digits as a string (e.g. ``'ED'``).

    Returns:
        The integer value of ``x`` read in base 16, or 0 when ``x`` cannot be
        parsed (empty string, non-hex characters, or not a string).
    """
    try:
        # Built-in int instead of np.int: the alias was removed in NumPy 1.24,
        # and the former bare `except` turned the resulting AttributeError
        # into a silent 0 for every input.
        return int(x, 16)
    except (ValueError, TypeError):
        return 0

def get_rgb(colors):
    """Split a Series of hex color codes into red, green and blue channels.

    Each value is sliced into character pairs [0:2], [2:4] and [4:6], and each
    pair is decoded with ``hex_to_decimal``.

    Args:
        colors: Series of color strings (presumably 'RRGGBB' without a
            leading '#'; verify against the data).

    Returns:
        A tuple ``(r, g, b)`` of Series, one per channel.
    """
    def channel(start):
        # Decode the two-character slice beginning at `start` for every row.
        return colors.apply(lambda code: hex_to_decimal(code[start:start + 2]))

    return channel(0), channel(2), channel(4)

In [None]:
def preprocess_inputs(df, vocab_length=20000):
    """Turn the raw gender-classifier frame into model-ready inputs.

    Args:
        df: Raw DataFrame as loaded from the CSV. It is not modified; a copy
            is processed.
        vocab_length: Vocabulary size handed to ``get_sequences`` for both
            text columns. Must match the embedding ``input_dim`` of the model.

    Returns:
        A tuple ``(X, desc, tweets, y)``:
            X: DataFrame of the remaining columns, standardized per column.
            desc: Padded token-id sequences built from ``description``.
            tweets: Padded token-id sequences built from ``text``.
            y: Integer labels (female=0, male=1, brand=2).
    """
    df = df.copy()

    # Drop unnecessary columns
    df = df.drop(['_unit_id', 'name', 'profileimage', 'tweet_id'], axis=1)

    # Encode unknown values in the target column as missing.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    df['gender'] = df['gender'].replace('unknown', np.nan)

    # Drop rows with missing target values
    df = df.dropna(subset=['gender']).reset_index(drop=True)

    # Drop columns with over 30% missing values
    missing_cols = df.columns[df.isna().mean() > 0.3]
    df = df.drop(missing_cols, axis=1)

    # Only a few rows are still missing _last_judgment_at, so drop those rows
    df = df.dropna(subset=['_last_judgment_at']).reset_index(drop=True)

    # Encode the missing values in the description column as empty strings
    df['description'] = df['description'].fillna('')

    # Create date/time columns
    date_columns = ['_last_judgment_at', 'created', 'tweet_created']
    for column in date_columns:
        df[column] = pd.to_datetime(df[column])

    df['judgment_day'] = df['_last_judgment_at'].dt.day
    df['judgment_hour'] = df['_last_judgment_at'].dt.hour

    df['created_year'] = df['created'].dt.year
    df['created_month'] = df['created'].dt.month
    df['created_day'] = df['created'].dt.day
    df['created_hour'] = df['created'].dt.hour

    df['tweet_hour'] = df['tweet_created'].dt.hour

    df = df.drop(date_columns, axis=1)

    # Get sequence data for description and text columns
    desc = get_sequences(df['description'], vocab_length=vocab_length)
    tweets = get_sequences(df['text'], vocab_length=vocab_length)

    df = df.drop(['description', 'text'], axis=1)

    # Drop columns with only one value
    df = df.drop(['_golden', '_unit_state', '_trusted_judgments', 'profile_yn'], axis=1)

    # Encode color columns as RGB values
    df['link_red'], df['link_green'], df['link_blue'] = get_rgb(df['link_color'])
    df['side_red'], df['side_green'], df['side_blue'] = get_rgb(df['sidebar_color'])

    df = df.drop(['link_color', 'sidebar_color'], axis=1)

    # Encode label column. map + astype(int) yields a true integer column
    # without relying on replace()'s deprecated silent downcasting, and fails
    # loudly if an unexpected label is present.
    label_mapping = {'female': 0, 'male': 1, 'brand': 2}
    df['gender'] = df['gender'].map(label_mapping).astype(int)

    # Split df into X and y
    y = df['gender'].copy()
    X = df.drop('gender', axis=1).copy()

    # Scale X with a standard scaler.
    # NOTE(review): the scaler (and the tokenizers above) are fit on all rows
    # before the train/test split, so test-set statistics leak into training.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X, desc, tweets, y

In [None]:
# Build the tabular features, the two padded text-sequence arrays, and the labels.
X, desc, tweets, y = preprocess_inputs(data)

In [None]:
# Display the scaled tabular features.
X

In [None]:
# Number of distinct values in each feature column.
{name: len(values.unique()) for name, values in X.items()}

In [None]:
# Shape of the padded description sequences.
desc.shape

In [None]:
# Shape of the padded tweet sequences.
tweets.shape

In [None]:
# Number of rows per encoded class label.
y.value_counts()

# Train-Test Split

In [None]:
# 70/30 split applied to all four arrays with one shared shuffle (seed 1),
# so rows stay aligned across the tabular, text and label splits.
(
    X_train, X_test,
    desc_train, desc_test,
    tweets_train, tweets_test,
    y_train, y_test,
) = train_test_split(X, desc, tweets, y, train_size=0.7, random_state=1)

# Modeling

In [None]:
# Re-check the description sequence shape; its second dimension sizes the model's description input.
desc.shape

In [None]:
def build_model(num_features=None, desc_length=None, tweet_length=None,
                vocab_size=20000, num_classes=3):
    """Build the three-input classifier (tabular + description + tweet text).

    Each text input is embedded, then both a GRU summary and the flattened
    embeddings are kept; these are concatenated with the dense tabular branch
    and fed to a softmax output layer.

    Args:
        num_features: Width of the tabular input. Defaults to ``X.shape[1]``
            from the notebook's global ``X``.
        desc_length: Length of the padded description sequences. Defaults to
            ``desc.shape[1]`` from the global ``desc``.
        tweet_length: Length of the padded tweet sequences. Defaults to
            ``tweets.shape[1]`` from the global ``tweets``.
        vocab_size: Embedding ``input_dim``; must match the vocabulary size
            used when tokenizing the text.
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``tf.keras.Model`` taking
        ``[X_inputs, desc_inputs, tweet_inputs]``.
    """
    # Fall back to the notebook globals so a bare build_model() still works.
    if num_features is None:
        num_features = X.shape[1]
    if desc_length is None:
        desc_length = desc.shape[1]
    if tweet_length is None:
        tweet_length = tweets.shape[1]

    def text_branch(inputs):
        # Embedding -> (GRU summary, flattened embeddings) -> concatenation.
        # `input_length` is not passed: it is deprecated, and the fixed Input
        # shape already gives Flatten a static sequence length.
        embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=256
        )(inputs)
        gru = tf.keras.layers.GRU(256, return_sequences=False)(embedding)
        flatten = tf.keras.layers.Flatten()(embedding)
        return tf.keras.layers.concatenate([gru, flatten])

    X_inputs = tf.keras.Input(shape=(num_features,))
    desc_inputs = tf.keras.Input(shape=(desc_length,))
    tweet_inputs = tf.keras.Input(shape=(tweet_length,))

    # X
    X_dense1 = tf.keras.layers.Dense(256, activation='relu')(X_inputs)
    X_dense2 = tf.keras.layers.Dense(256, activation='relu')(X_dense1)

    # desc
    desc_concat = text_branch(desc_inputs)

    # tweets
    tweet_concat = text_branch(tweet_inputs)

    concat = tf.keras.layers.concatenate([X_dense2, desc_concat, tweet_concat])

    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(concat)

    model = tf.keras.Model(inputs=[X_inputs, desc_inputs, tweet_inputs], outputs=outputs)

    return model

In [None]:
model = build_model()

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
tf.keras.utils.plot_model(model)

# Training

In [None]:
# Training configuration
batch_size = 32
epochs = 3

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# The last 20% of the training rows are held out for validation; the
# checkpoint keeps only the best weights seen so far.
# NOTE(review): newer Keras releases may require weights-only checkpoint paths
# to end in `.weights.h5`; the same path is loaded in a later cell, so change
# both together if that applies.
history = model.fit(
    x=[X_train, desc_train, tweets_train],
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint('./model.h5', save_best_only=True, save_weights_only=True),
        tf.keras.callbacks.ReduceLROnPlateau(),
    ],
)

In [None]:
# Restore the weights written by the ModelCheckpoint callback during training.
model.load_weights('./model.h5')

# Results

In [None]:
# evaluate() returns [loss, accuracy] given the metrics set at compile time,
# so index 1 is the test accuracy.
results = model.evaluate([X_test, desc_test, tweets_test], y_test, verbose=0)
print("Model Accuracy: {:.2f}%".format(results[1] * 100))

In [None]:
# Ground-truth labels as a plain NumPy array.
y_true = np.array(y_test)

# Per-class probabilities -> index of the most likely class for each row.
y_pred = np.argmax(model.predict([X_test, desc_test, tweets_test]), axis=1)

In [None]:
# Confusion matrix and per-class report; target_names follow the label
# encoding order (0=female, 1=male, 2=brand).
cm = confusion_matrix(y_true, y_pred)
clr = classification_report(y_true, y_pred, target_names=['Female', 'Male', 'Brand'])

In [None]:
# Plot the confusion matrix as an annotated heatmap.
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='g', cbar=False, cmap='Blues')
# Ticks are offset by 0.5 so each class label sits at the center of its cell.
plt.xticks(np.arange(3) + 0.5, ['Female', 'Male', 'Brand'])
plt.yticks(np.arange(3) + 0.5, ['Female', 'Male', 'Brand'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Print the per-class precision/recall/F1 report built above.
print("Classification Report:\n\n", clr)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/0Jb0ywwLQgI