In [None]:
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import swifter

from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

## 1. Analyze the given data and understand the problem

#### 1.1 Identify what attributes are available for both users and questions and their data types
#### 1.2 Check for missing values
#### 1.3 Check if train questions have only the right answer or if they also contain the uids for those who responded to a specific question
#### 1.4 Check if there are users which have more than 1 "winner" answer
#### 1.5 Train vs. Test distribution

In [None]:
# Load the users table from the CSV file located in the working directory.
df_users = pd.read_csv('users.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
df_users.info()
# Last expression of the cell: rendered as a table preview of the first rows.
df_users.head()

In [None]:
# Load the training questions from the CSV file located in the working directory.
df_questions = pd.read_csv('questions_train.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
df_questions.info()
# Last expression of the cell: rendered as a table preview of the first rows.
df_questions.head()

In [None]:
# Load the test questions from the CSV file located in the working directory.
df_questions_tst = pd.read_csv('questions_test.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the cell output.
df_questions_tst.info()
# Last expression of the cell: rendered as a table preview of the first rows.
df_questions_tst.head()

In [None]:
def show_missing_fields(df):
    """Print a per-column table with the percentage of null entries in ``df``.

    The table has two columns: ``column_name`` (the column labels of ``df``)
    and ``percent_missing`` (null count * 100 / number of rows). Nothing is
    returned; the table is only printed.
    """
    row_count = len(df)
    null_counts = df.isnull().sum()
    summary = pd.DataFrame(
        {
            'column_name': df.columns,
            'percent_missing': null_counts * 100 / row_count,
        }
    )
    print(summary)

In [None]:
# Share of missing values per column in the users table.
show_missing_fields(df=df_users)

In [None]:
# Share of missing values per column in the training questions.
show_missing_fields(df=df_questions)

In [None]:
# Share of missing values per column in the test questions.
show_missing_fields(df=df_questions_tst)

#### Change date columns from string to date data type
    Because date formats are inconsistent (some have milliseconds, some do not), I will keep only the date part, in %Y-%m-%d format

In [None]:
def format_date(datetime_date):
    """Return the date portion of a timestamp string, dropping the time of day.

    Two layouts are recognised:

    * three space-separated fields (the layout of the users dates): the
      first field is returned;
    * date and time joined by ``'T'`` (the layout of the questions dates):
      the text before the first ``'T'`` is returned.

    Args:
        datetime_date: The raw timestamp value taken from a date column.

    Returns:
        The date part as a string. Non-string values (for example NaN for a
        missing date, or a value that was already converted to a timestamp)
        are returned unchanged, so the function can be applied to columns
        with gaps and re-applied after conversion without crashing.

    Raises:
        ValueError: If the string matches neither layout.
    """
    # The original bare ``except:`` hid every kind of failure and then crashed
    # inside the handler for non-strings; handle that case explicitly instead.
    if not isinstance(datetime_date, str):
        return datetime_date

    # Users layout: exactly three space-separated fields.
    fields = datetime_date.split(' ')
    if len(fields) == 3:
        return fields[0]

    # Questions layout: date and time separated by 'T'.
    date_part, separator, _ = datetime_date.partition('T')
    if not separator:
        raise ValueError('Unrecognised date layout: {!r}'.format(datetime_date))
    return date_part

### Use swifter to optimize the processing speed of apply function

In [None]:
# Reduce every raw timestamp string to its date part. swifter's apply is used
# in place of the plain pandas apply for speed.
df_users['creation_date'] = df_users['creation_date'].swifter.apply(format_date)
df_users['last_access_date'] = df_users['last_access_date'].swifter.apply(format_date)

df_questions['date'] = df_questions['date'].swifter.apply(format_date)

df_questions_tst['date'] = df_questions_tst['date'].swifter.apply(format_date)


In [None]:
# Parse the date-only strings into datetime columns, one (frame, column) pair
# at a time; all of them share the same layout.
DATE_FORMAT = '%Y-%m-%d'

for frame, column in (
    (df_users, 'last_access_date'),
    (df_users, 'creation_date'),
    (df_questions, 'date'),
    (df_questions_tst, 'date'),
):
    frame[column] = pd.to_datetime(frame[column], format=DATE_FORMAT)



In [None]:
# (message template, value) pairs, printed in order. A trailing '\n' in a
# template leaves a blank line between the three groups.
date_range_report = (
    ('The oldest user creation date is: {}', df_users.creation_date.min()),
    ('The most recent user creation date is: {}\n', df_users.creation_date.max()),
    ('The oldest question date is: {}', df_questions.date.min()),
    ('The most recent question date is: {}\n', df_questions.date.max()),
    ('The oldest question date for TEST SET is: {}', df_questions_tst.date.min()),
    ('The most recent question date for TEST SET is: {}', df_questions_tst.date.max()),
)

for template, value in date_range_report:
    print(template.format(value))

#### When recommending the top 20 users for a question, I must make sure not to recommend users who did not yet have an account when the question was asked.

In [None]:
# Total number of rows in the training questions.
total_questions = len(df_questions)
# Distinct values per column. The previous len(column) simply repeated the row
# count, so the "unique" figures could never differ from the total.
# nunique() ignores missing values by default.
total_unique_questions = df_questions.question_id.nunique()
unique_user_answers = df_questions.accepted_answer_id.nunique()
# Rows where owner_user_id equals accepted_answer_id.
# NOTE(review): this compares a user id with a column named
# accepted_answer_id - confirm that column really holds the answering user's id.
ownwer_answers_own_question = len(df_questions[df_questions.owner_user_id == df_questions.accepted_answer_id])

print('Number of questions is: {}'.format(total_questions))
print('Number of unqiue accepted answer ids is: {}'.format(unique_user_answers))
print('Number of unique questions is {}'.format(total_unique_questions))
print('Numbers of questions where the owner answered to his own question is : {}'.format(ownwer_answers_own_question))

### Check if count of words distribution is the same for both train and test 

In [None]:
# Question bodies of the train and test sets.
train_qs = df_questions['text']
test_qs = df_questions_tst['text']

# Naive token count: split on the single space character only, which is fast
# and good enough for comparing the two length distributions.
word_train = train_qs.map(lambda body: len(body.split(' ')))
word_test = test_qs.map(lambda body: len(body.split(' ')))

In [None]:
# Overlay the normalised word-count histograms of question bodies for the
# train and test sets on a single axes.
fig, ax = plt.subplots(figsize=(15, 10))
body_hist_kwargs = dict(bins=50, range=[0, 50], density=True)
ax.hist(word_train, color='black', label='train', **body_hist_kwargs)
ax.hist(word_test, color='blue', alpha=0.5, label='test', **body_hist_kwargs)
ax.set_title('Words probability distributions for question body')
ax.legend()
plt.show()

In [None]:
# Question titles of the train and test sets.
train_titles = df_questions['title']
test_titles = df_questions_tst['title']

# Naive token count: split on the single space character only, which is fast
# and good enough for comparing the two length distributions.
title_words_train = train_titles.map(lambda title: len(title.split(' ')))
title_words_test = test_titles.map(lambda title: len(title.split(' ')))

In [None]:
# Overlay the normalised word-count histograms of question titles for the
# train and test sets on a single axes.
fig, ax = plt.subplots(figsize=(15, 10))
title_hist_kwargs = dict(bins=50, range=[0, 50], density=True)
ax.hist(title_words_train, color='blue', label='train', **title_hist_kwargs)
ax.hist(title_words_test, color='red', alpha=0.7, label='test', **title_hist_kwargs)
ax.legend()
ax.set_title('Words probability distributions for question title')
plt.show()

In [None]:
# Build the cloud from a random subset of question bodies.
# Series.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available rows.
cloud_sample = df_questions.text.sample(min(10000, len(df_questions)))
cloud = WordCloud(width = 1200, height= 900).generate(" ".join(cloud_sample))

plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.axis('off')
plt.show()

In [None]:
# Build the cloud from a random subset of test question bodies.
# Series.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available rows.
cloud_sample_tst = df_questions_tst.text.sample(min(10000, len(df_questions_tst)))
cloud_tst = WordCloud(width = 1200, height= 900).generate(" ".join(cloud_sample_tst))

plt.figure(figsize=(20, 15))
plt.imshow(cloud_tst)
plt.axis('off')
plt.show()

### Conclusion

#### For each question we only know the accepted answer; we have no data about other answers that may have been given to that question.

#### Given this information, it is pretty obvious that a Collaborative Filtering approach will not be suitable for this task.

## 2. Exploratory Data Analysis

#### 2.1 Compute the time spent on platform for each user
#### 2.2 Check distribution for numerical data
#### 2.3 See how numerical data changes over time

In [None]:
# Whole days between account creation and the last recorded access.
account_lifetime = df_users['last_access_date'].sub(df_users['creation_date'])
df_users['time_spent_days'] = account_lifetime.dt.days

In [None]:
# Preview the first rows, now including the derived time_spent_days column.
df_users.head(n=5)

In [None]:
def plot_distribution(field, bins=30):
    """Show a histogram of one ``df_users`` column with a log-scaled count axis.

    Args:
        field: Name of the ``df_users`` column to plot; also used as the
            x-axis label.
        bins: Number of histogram bins (default 30, the value that was
            previously hard-coded).
    """
    # An explicit figure/axes pair replaces the former `plot = plt.plot()`,
    # whose result was never used.
    fig, ax = plt.subplots()
    # log=True puts the counts on a logarithmic scale.
    ax.hist(df_users[field], bins=bins, log=True)
    ax.set_ylabel('Count')
    ax.set_xlabel(field)
    plt.show()

In [None]:
# User columns whose distributions are inspected one by one.
numeric_fields = [
    'reputation',
    'up_votes',
    'down_votes',
    'views',
    'time_spent_days',
]

for field in numeric_fields:
    plot_distribution(field=field)

#### To plot the relationship between numerical data and time I will use only a sample of the data, to keep plotting fast
#### I will also keep only the users whose value for the plotted attribute (reputation, up-votes etc.) is greater than 1

In [None]:
def plot_joint_time(field, sample_size=5000, random_state=None):
    """Show a joint scatter plot of ``time_spent_days`` against one user column.

    Only ``df_users`` rows whose ``field`` value is greater than 1 are
    considered, and a random subset of them is plotted.

    Args:
        field: Name of the ``df_users`` column placed on the y axis.
        sample_size: Maximum number of rows to plot (default 5000, the value
            that was previously hard-coded).
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the subset reproducible; ``None`` keeps the unseeded behaviour.
    """
    df_plot = df_users[df_users[field] > 1]
    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (sampling without replacement), so cap n at the rows left by the filter.
    n_rows = min(sample_size, len(df_plot))
    sample_data = df_plot.sample(n=n_rows, random_state=random_state)
    sns.jointplot(data=sample_data, x='time_spent_days', y=field, kind="scatter")
    plt.show()

In [None]:
for field in numeric_fields:
    # time_spent_days is the x axis of every joint plot, so it is not plotted
    # against itself.
    if field != 'time_spent_days':
        plot_joint_time(field)

#### We can see that each numerical attribute tends to have higher values as the number of days grows
#### Because of that, in the data processing phase, I will normalize these values by the time each user has spent on the platform. (A user with 100 reputation after 5 days might be just as good as (or better than) a user with 500 reputation after 2 years)

## 3. Data Processing and Encoding
#### 3.1 Fill NaN values
#### 3.2 Encode categorical columns into numericals
#### 3.3 Scale numerical values
#### 3.4 Process text data and encode it

In [None]:
# Print only the first few 'about_me' values to see what the raw text looks
# like. Printing the whole column floods the notebook output and exposes
# user-written profile text.
for item in df_users.about_me.head(10):
    print(item)