### 1. Loading and cleaning of data

The data sets are from the website https://travel.stackexchange.com/

The file travel.stackexchange.com.7z (license: cc-by-sa 3.0) was downloaded from https://archive.org/details/stackexchange as XML files, each represents another data frame.

The data was:
- imported using Python,
- cleared - numerical data in the appropriate int or float types, dates in the datetime type, etc.
- transformed into pandas data frames,
- exported to a CSV file.

The full code needed to complete the above steps is provided below.

In [1]:
# import of required packages

import pandas as pd
import numpy as np
from copy import deepcopy
import io, os.path
from lxml import etree as et
import csv


# define a function that creates a dictionary for loaded tables

def dict_fun(root):
    root_attrib = root.attrib
    for tab in root:
        tab_dict = deepcopy(root_attrib)
        attrib_dict = {}
        attrib_dict.update(tab.attrib)
        for key, value in attrib_dict.items():
            attrib_dict.update({key:value})
        tab_dict.update(attrib_dict)
        yield tab_dict
        
# load the XML files, convert to pandas data frames using the dict_fun function and save in the form of CSV

for frame in ['Badges','Comments','PostHistory','PostLinks','Posts','Tags','Users','Votes']:
    link = os.path.join(r'.\Travel_stack_exchange',frame)  
    tree = et.parse(link + '.xml')
    root = tree.getroot()
    tab_list = list(dict_fun(root))
    df = pd.DataFrame(tab_list)
    df = df.replace(r'\\n', ' ', regex=True)
    df = df.replace(r'\\r', ' ', regex=True)
    df.to_csv(link + ".csv", sep=';', index=False)
    df = pd.read_csv(link + ".csv", sep=';')

In [2]:
# load data frames from CSV files

Badges_df = pd.read_csv(r'.\Travel_stack_exchange\Badges.csv', sep=';')
Comments_df = pd.read_csv(r'.\Travel_stack_exchange\Comments.csv', sep=';')
PostHistory_df = pd.read_csv(r'.\Travel_stack_exchange\PostHistory.csv', sep=';')
PostLinks_df = pd.read_csv(r'.\Travel_stack_exchange\PostLinks.csv', sep=';')
Posts_df = pd.read_csv(r'.\Travel_stack_exchange\Posts.csv', sep=';')
Tags_df = pd.read_csv(r'.\Travel_stack_exchange\Tags.csv', sep=';')
Users_df = pd.read_csv(r'.\Travel_stack_exchange\Users.csv', sep=';')
Votes_df = pd.read_csv(r'.\Travel_stack_exchange\Votes.csv', sep=';')

In [3]:
# calling of data frames

Badges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136869 entries, 0 to 136868
Data columns (total 6 columns):
Class       136869 non-null int64
Date        136869 non-null object
Id          136869 non-null int64
Name        136869 non-null object
TagBased    136869 non-null bool
UserId      136869 non-null int64
dtypes: bool(1), int64(3), object(2)
memory usage: 5.4+ MB


In [4]:
# we want the column displaying dates to be of the datetime type
Badges_df.Date = pd.to_datetime(Badges_df.Date)
Badges_df.head(1)

Unnamed: 0,Class,Date,Id,Name,TagBased,UserId
0,3,2011-06-21 20:16:48.910,1,Autobiographer,False,2


In [5]:
Badges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136869 entries, 0 to 136868
Data columns (total 6 columns):
Class       136869 non-null int64
Date        136869 non-null datetime64[ns]
Id          136869 non-null int64
Name        136869 non-null object
TagBased    136869 non-null bool
UserId      136869 non-null int64
dtypes: bool(1), datetime64[ns](1), int64(3), object(1)
memory usage: 5.4+ MB


In [6]:
Comments_df.CreationDate = pd.to_datetime(Comments_df.CreationDate)
Comments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193971 entries, 0 to 193970
Data columns (total 7 columns):
CreationDate       193971 non-null datetime64[ns]
Id                 193971 non-null int64
PostId             193971 non-null int64
Score              193971 non-null int64
Text               193971 non-null object
UserDisplayName    4673 non-null object
UserId             190175 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 10.4+ MB


In [7]:
Comments_df.head(1)

Unnamed: 0,CreationDate,Id,PostId,Score,Text,UserDisplayName,UserId
0,2011-06-21 20:25:14.257,1,1,0,To help with the cruise line question: Where a...,,12.0


In [8]:
PostHistory_df.CreationDate = pd.to_datetime(PostHistory_df.CreationDate)
PostHistory_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313315 entries, 0 to 313314
Data columns (total 9 columns):
Comment              157669 non-null object
CreationDate         313310 non-null datetime64[ns]
Id                   313315 non-null int64
PostHistoryTypeId    313310 non-null float64
PostId               313310 non-null float64
RevisionGUID         313310 non-null object
Text                 278411 non-null object
UserDisplayName      6402 non-null object
UserId               281374 non-null float64
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 21.5+ MB


In [9]:
PostHistory_df.head(1)

Unnamed: 0,Comment,CreationDate,Id,PostHistoryTypeId,PostId,RevisionGUID,Text,UserDisplayName,UserId
0,,2011-06-21 20:19:34.730,1,2.0,1.0,1e04af17-3bdb-4263-aa46-97ee7fb1b0b6,My finance and myself are looking for a good C...,,9.0


In [10]:
PostLinks_df.CreationDate = pd.to_datetime(PostLinks_df.CreationDate)
PostLinks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19812 entries, 0 to 19811
Data columns (total 5 columns):
CreationDate     19812 non-null datetime64[ns]
Id               19812 non-null int64
LinkTypeId       19812 non-null int64
PostId           19812 non-null int64
RelatedPostId    19812 non-null int64
dtypes: datetime64[ns](1), int64(4)
memory usage: 774.0 KB


In [11]:
PostLinks_df.head(1)

Unnamed: 0,CreationDate,Id,LinkTypeId,PostId,RelatedPostId
0,2011-06-21 20:49:19.747,168,1,28,25


In [12]:
Posts_df.ClosedDate = pd.to_datetime(Posts_df.ClosedDate)
Posts_df.CommunityOwnedDate = pd.to_datetime(Posts_df.CommunityOwnedDate)
Posts_df.CreationDate = pd.to_datetime(Posts_df.CreationDate)
Posts_df.LastActivityDate = pd.to_datetime(Posts_df.LastActivityDate)
Posts_df.LastEditDate = pd.to_datetime(Posts_df.LastEditDate)
Posts_df.info()
# Brak kolumny DeletionDate, zgodnie z meta.stackexchange.com/question/2677/database-schema-documentation-for-the-public-data-dump-and-sede\
# ta kolumna nie występuje dla data dump. Zatem ramka danych jest poprawna.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87956 entries, 0 to 87955
Data columns (total 21 columns):
AcceptedAnswerId         13490 non-null float64
AnswerCount              31843 non-null float64
Body                     87659 non-null object
ClosedDate               7215 non-null datetime64[ns]
CommentCount             87956 non-null int64
CommunityOwnedDate       290 non-null datetime64[ns]
CreationDate             87956 non-null datetime64[ns]
FavoriteCount            7664 non-null float64
Id                       87956 non-null int64
LastActivityDate         87956 non-null datetime64[ns]
LastEditDate             50123 non-null datetime64[ns]
LastEditorDisplayName    1238 non-null object
LastEditorUserId         49424 non-null float64
OwnerDisplayName         2613 non-null object
OwnerUserId              85879 non-null float64
ParentId                 52013 non-null float64
PostTypeId               87956 non-null int64
Score                    87956 non-null int64
Tags     

In [13]:
Posts_df.head(1)

Unnamed: 0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,LastActivityDate,...,LastEditorDisplayName,LastEditorUserId,OwnerDisplayName,OwnerUserId,ParentId,PostTypeId,Score,Tags,Title,ViewCount
0,393.0,4.0,<p>My fiancée and I are looking for a good Car...,2013-02-25 23:52:47.953,4,NaT,2011-06-21 20:19:34.730,,1,2012-05-24 14:52:14.760,...,,101.0,,9.0,,1,8,<caribbean><cruising><vacations>,What are some Caribbean cruises for October?,443.0


In [14]:
Tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1784 entries, 0 to 1783
Data columns (total 5 columns):
Count            1784 non-null int64
ExcerptPostId    1714 non-null float64
Id               1784 non-null int64
TagName          1784 non-null object
WikiPostId       1714 non-null float64
dtypes: float64(2), int64(2), object(1)
memory usage: 69.8+ KB


In [15]:
Tags_df.head(1)

Unnamed: 0,Count,ExcerptPostId,Id,TagName,WikiPostId
0,116,2138.0,1,cruising,2137.0


In [16]:
Users_df.CreationDate = pd.to_datetime(Users_df.CreationDate)
Users_df.LastAccessDate = pd.to_datetime(Users_df.LastAccessDate)
Users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59328 entries, 0 to 59327
Data columns (total 13 columns):
AboutMe            16928 non-null object
AccountId          59325 non-null float64
CreationDate       59328 non-null datetime64[ns]
DisplayName        59325 non-null object
DownVotes          59328 non-null int64
Id                 59328 non-null int64
LastAccessDate     59328 non-null datetime64[ns]
Location           20000 non-null object
ProfileImageUrl    38425 non-null object
Reputation         59328 non-null int64
UpVotes            59328 non-null int64
Views              59328 non-null int64
WebsiteUrl         11751 non-null object
dtypes: datetime64[ns](2), float64(1), int64(5), object(5)
memory usage: 5.9+ MB


In [17]:
Users_df.head(1)

Unnamed: 0,AboutMe,AccountId,CreationDate,DisplayName,DownVotes,Id,LastAccessDate,Location,ProfileImageUrl,Reputation,UpVotes,Views,WebsiteUrl
0,"<p>Hi, I'm not really a person.</p>\r\n\r\n<p>...",-1.0,2011-06-21 15:16:44.253,Community,16235,-1,2011-06-21 15:16:44.253,on the server farm,,1,3160,0,http://meta.stackexchange.com/


In [18]:
Votes_df.CreationDate = pd.to_datetime(Votes_df.CreationDate)
Votes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698143 entries, 0 to 698142
Data columns (total 6 columns):
BountyAmount    2633 non-null float64
CreationDate    698143 non-null datetime64[ns]
Id              698143 non-null int64
PostId          698143 non-null int64
UserId          15483 non-null float64
VoteTypeId      698143 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 32.0 MB


In [19]:
Votes_df.head(1)

Unnamed: 0,BountyAmount,CreationDate,Id,PostId,UserId,VoteTypeId
0,,2011-06-21,1,1,,2


The entire first part code is executed in about 65 seconds.