# Creating a Tweet Database

Create a sqlLite database that we can use for faster indexing.

**Links:**
- [Twitter Streaming with Python and SQLite, David Schuler](http://www.davidrschuler.com/python_twitter_streaming)
- [Working with SQLite Databases using Python and Pandas, DataQuest](https://www.dataquest.io/blog/python-pandas-databases/)
- [Big Data Analytics with Pandas and SQLite in Python, Plotly](https://plot.ly/ipython-notebooks/big-data-analytics-with-pandas-and-sqlite/)

<br>
```
df.to_sql("table_name", conn, if_exists="replace/append")        # write
pd.read_sql_query("select * from table_name limit 1;", conn)     # read
```

In [10]:
# Libraries

%run utilities.py
import sqlite3

In [11]:
# run two commands to create/clear the database
!rm ../../data/processed/scrape/raw-tweets.db
!touch ../../data/processed/scrape/raw-tweets.db

**Variables and Functions**

In [12]:
# Schemas
raw_schema = """CREATE TABLE IF NOT EXISTS Raw(
  tweetID INTEGER,
  date DATE,
  week DATE,
  username TEXT,
  message TEXT,
  retweet INTEGER,
  longitude REAL,
  latitude REAL,
  followersCount INTEGER,
  friendsCount INTEGER,
  joinDay TEXT,
  favouritesCount INTEGER,
  language TEXT,
  statusesCount INTEGER
);""".replace('\n', '')

def filter_tweet_df(raw_df):
    """
    Formats and returns a dataframe of tweets
    """
    dt_parser = lambda x: pd.to_datetime(x, infer_datetime_format=True) # format="%b-%d-%Y")
    #dt_parser = lambda t: t.split()[1] +'-'+ t.split()[2] +'-'+ t.split()[-1]
    
    # Drop rows that'll give us trouble
    raw_df.dropna(subset=['date', 'longitude', 'latitude', 'message'], how='any', inplace=True)
    raw_df['week'] = raw_df['date'].map(dt_parser)
    
    return raw_df
    
def read_tweets(f):
    conversions = {'retweet': lambda x: 1 if (x=='RT') else x
                  #'date': lambda t: t.split()[1] +'-'+ t.split()[2] +'-'+ t.split()[-1]
                  }
    raw_df = pd.read_csv(f, usecols=cols_all,    # filtering
                         low_memory=False, engine='c', # optimization
                        converters=conversions)        # format attrs
    return raw_df

---

In [13]:
# start
conn = sqlite3.connect(tweet_db_f)
cur = conn.cursor()

In [14]:
# add table to database
cur.execute(raw_schema)
conn.commit()

In [15]:
# add tweets to new DB table 
tweets_list = ls_files_list(external_scrape_dir)

for tweets in tweets_list[1:]:
    tweet_df = read_tweets(tweets)
    slim_df = filter_tweet_df(tweet_df)
    
    slim_df.to_sql("Raw", conn, if_exists="append", index=False)

OperationalError: table Raw has no column named userID

In [None]:
# add changes and close
conn.commit()
conn.close()