# NLP Project: Instagram Corpora 💁🏼‍♀️💅🏻✨

In [28]:
from flask_sqlalchemy import SQLAlchemy
from flask import Flask, render_template, request
import sqlite3

import nltk
import pymorphy2
from razdel import sentenize, tokenize
from instagrapi import Client

import csv
from tqdm import tqdm
from collections import defaultdict

# Shared pymorphy2 analyzer: created once here and used through this global
# name by the functions defined in the following cells.
pymorph = pymorphy2.MorphAnalyzer()

## 1. Collecting data from Instagram

In [29]:
def lemmatizer(token):
    """Return the lemma of ``token``.

    The lemma is the ``normal_form`` of the first parse that the shared
    ``pymorph`` analyzer returns for the token.
    """
    return pymorph.parse(token)[0].normal_form

In [30]:
def tokenizer(text):
    """Split ``text`` with ``nltk.word_tokenize`` and keep alphabetic tokens only.

    Tokens containing digits, punctuation or any other non-letter character
    are dropped (``str.isalpha`` must hold for the whole token).
    """
    words = nltk.word_tokenize(text)
    return list(filter(str.isalpha, words))

In [4]:
def get_texts_from_instagram(user_ids_list, file_path, USERNAME='your username', 
                             PASSWORD='your password', number_of_posts=50):
    """Download post captions and append them, sentence by sentence, to a CSV file.

    For every account the function fetches up to ``number_of_posts`` media
    items, splits each caption into sentences with ``sentenize`` and writes
    one row per non-empty sentence: ``[sentence id, sentence, post URL]``.

    Parameters
    ----------
    user_ids_list : iterable of str
        Instagram usernames to collect captions from.
    file_path : str
        CSV file to write to. It is opened in append mode, so existing rows
        are kept.
    USERNAME, PASSWORD : str
        Credentials passed to ``Client.login``. The defaults are placeholders;
        pass real values (preferably read from the environment, not hardcoded).
    number_of_posts : int
        Amount passed to ``Client.user_medias`` for each account.

    Notes
    -----
    Sentence ids start at 1 on every call. Because the file is appended to,
    calling the function twice on the same file produces repeated ids.
    """
    insta_cl = Client()
    insta_cl.login(USERNAME, PASSWORD)

    # `with` guarantees the file is flushed and closed even if a request fails
    # half-way; newline='' is what the csv module expects for files it writes.
    with open(file_path, 'a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)

        sent_id = 1
        for account in tqdm(user_ids_list):
            user_id = insta_cl.user_id_from_username(account)
            medias = insta_cl.user_medias(user_id, number_of_posts)
            for post in medias:
                post_url = 'https://www.instagram.com/p/' + post.code
                for substring in sentenize(post.caption_text):
                    # Keep every sentence on a single line of text.
                    sent = substring.text.replace('\n', ' ')
                    if sent:
                        writer.writerow([sent_id, sent, post_url])
                        sent_id += 1

In [10]:
# Create a list of insta accounts (one username per line).
# The file is closed as soon as it has been read, and blank lines are skipped
# so that no empty username is passed on to the scraper.
with open('instagram_ids.txt', encoding='utf-8') as ids_file:
    user_ids_list = [link.strip() for link in ids_file if link.strip()]

In [11]:
# Scrape the listed accounts into instagram_texts.csv.
# NOTE: the function appends to the file and restarts sentence ids at 1 on
# every call, so re-running this cell adds rows with repeated ids.
get_texts_from_instagram(user_ids_list, 'instagram_texts.csv')

100%|██████████| 10/10 [04:05<00:00, 24.59s/it]


## 2. Create a database

**DB design:** 
1. A table with context information: context id, context, metadata.
2. A table with morphological features of tokens: id, context id, token, lemma, pos.

In [31]:
# Create both corpus tables if they are not there yet.
# The statements are safe to re-run thanks to IF NOT EXISTS.
create_context_metadata_sql = """
CREATE TABLE IF NOT EXISTS context_metadata 
(id_cm int PRIMARY KEY, 
context text,
metadata text)
"""

create_morphology_sql = """
CREATE TABLE IF NOT EXISTS morphology 
(id_morph int PRIMARY KEY,
context_id int,
token text,
lemma text,
pos text
)
"""

conn = sqlite3.connect('insta_corpus.db')
try:
    # `with conn` commits on success and rolls back if a statement raises;
    # the `finally` makes sure the connection never stays open after an error.
    with conn:
        cur = conn.cursor()
        cur.execute(create_context_metadata_sql)
        cur.execute(create_morphology_sql)
finally:
    conn.close()

### Tokenizing texts and collecting morphological information of tokens

In [32]:
# Read every CSV row back as a list of strings.
# newline='' lets the csv module handle line endings itself, so line breaks
# inside quoted fields are read back unchanged.
with open('instagram_texts.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    sentences = list(reader)

In [33]:
# Peek at the first row; rows were written as [sentence id, sentence, post URL].
sentences[0]

['1', 'Но давайте честно.', 'https://www.instagram.com/p/CU7q4SBjT8q']

### Filling in the 1st table with context id, context and metadata.

In [34]:
# Insert one row per sentence into context_metadata.
# Rows whose id already exists are skipped by INSERT OR IGNORE.
conn = sqlite3.connect('insta_corpus.db')
try:
    # `with conn` commits when the loop completes and rolls back if it raises;
    # the `finally` closes the connection in both cases.
    with conn:
        cur = conn.cursor()
        for context_id, context, metadata in sentences:
            cur.execute("INSERT or IGNORE INTO context_metadata VALUES (?,?,?)",
                        [context_id, context, metadata])
finally:
    conn.close()

In [40]:
def get_context_metadata_for_unigram(unigram_info):
    """Extract ``(context, metadata)`` pairs from a collection of match records.

    Parameters
    ----------
    unigram_info : iterable of mapping
        Each element must provide the keys ``'context'`` and ``'metadata'``.
        Any other keys (for example ``'_id'``) are ignored.

    Returns
    -------
    list of tuple
        One ``(context, metadata)`` tuple per element, in input order. An
        empty input yields an empty list.

    Raises
    ------
    KeyError
        If an element lacks ``'context'`` or ``'metadata'``.
    """
    # NOTE(review): the previous version opened a connection to insta_corpus.db
    # without querying or closing it. No database access is needed to reshape
    # the records, so it was removed. Confirm no lookup was intended here.
    return [(elem['context'], elem['metadata']) for elem in unigram_info]

### Filling in the 2nd table with token id, context id, token, lemma and pos.

A function which outputs a dictionary with token id, token, lemma, pos, context and metadata.

In [37]:
def pymorphy_token_analysis(token, id_num, context_id, context, metadata):
    """Describe one token as a dictionary of morphological and context data.

    The token is analysed once with the shared ``pymorph`` analyzer and the
    first parse is used for both the part of speech and the lemma.

    Parameters
    ----------
    token : str
        Word form to analyse.
    id_num
        Identifier stored under ``'id'``.
    context_id, context, metadata
        Values copied unchanged into the result under the same key names.

    Returns
    -------
    dict
        Keys ``'id'``, ``'token'``, ``'POS'``, ``'lemma'``, ``'context_id'``,
        ``'context'`` and ``'metadata'``. Verb forms also carry one marker
        key: ``'inf'`` (infinitive), ``'partcp'`` (participle) or ``'ger'``
        (gerund). ``'POS'`` is ``None`` when the analyzer gives no POS tag.
    """
    parsed = pymorph.parse(token)[0]
    pos_tag = parsed.tag.POS

    token_info = {'id': id_num, 'token': token}

    # Collapse the pymorphy2 tags for verb forms and adjectives into coarse labels.
    if pos_tag == 'INFN':
        token_info['POS'] = 'verb'
        token_info['inf'] = 'inf'
    elif pos_tag in ('PRTF', 'PRTS'):
        token_info['POS'] = 'verb'
        token_info['partcp'] = 'partcp'
    elif pos_tag in ('ADJF', 'ADJS'):
        token_info['POS'] = 'adj'
    elif pos_tag == 'GRND':
        token_info['POS'] = 'verb'
        token_info['ger'] = 'ger'
    else:
        # The tag may be missing (None), e.g. for tokens the analyzer cannot classify.
        token_info['POS'] = pos_tag.lower() if pos_tag is not None else None

    # Reuse the parse obtained above instead of analysing the token a second time.
    token_info['lemma'] = parsed.normal_form
    token_info['context_id'] = context_id
    token_info['context'] = context
    token_info['metadata'] = metadata

    return token_info

In [38]:
# Analyse every token of every sentence and store one row per token in morphology.
conn = sqlite3.connect('insta_corpus.db')
try:
    # `with conn` commits when the loop completes and rolls back if it raises;
    # the `finally` closes the connection in both cases.
    with conn:
        cur = conn.cursor()

        id_num = 1  # running token id, shared across all sentences
        for sent in sentences:
            context_id = sent[0]
            context = sent[1]
            metadata = sent[2]
            for token in tokenizer(context):
                token_dict = pymorphy_token_analysis(token, id_num, context_id, context, metadata)
                cur.execute("INSERT or IGNORE INTO morphology VALUES (?,?,?,?,?)",
                            [token_dict['id'], token_dict['context_id'], token_dict['token'],
                             token_dict['lemma'], token_dict['POS']])
                id_num += 1
finally:
    conn.close()