<a href="https://colab.research.google.com/github/pstorniolo/Master2021/blob/main/2021_11_06_2_SparkNLP_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

https://www.johnsnowlabs.com/spark-nlp/

# **Translate text**

### Spark NLP documentation and instructions:
https://nlp.johnsnowlabs.com/docs/en/quickstart

### You can find details about Spark NLP annotators here:
https://nlp.johnsnowlabs.com/docs/en/annotators

### You can find details about Spark NLP models here:
https://nlp.johnsnowlabs.com/models


## **Colab** Setup

In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip -q install --ignore-installed pyspark==3.0.3

# Install Spark NLP
! pip -q install --ignore-installed spark-nlp==3.1.0

## Start the Spark session

Import dependencies and start Spark session.

In [None]:
import json
import pandas as pd
import numpy as np

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# sparknlp.start() returns the SparkSession it configures, so bind it directly
# instead of discarding it and looking the session up again via the builder.
spark = sparknlp.start()

# Report the library versions in use.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

## Select the DL model

### Translate Italian text to English

In [None]:
# Italian sample passage (two sentences) to be translated into English.
text = "La Gioconda è un dipinto ad olio del XVI secolo creato da Leonardo. Si conserva al Louvre di Parigi."


### Define Spark NLP pipeline

In [None]:
# Stage 1: wrap the raw "text" column into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: more accurate sentence detection using deep learning.
# pretrained() is called on the class itself (as for MarianTransformer below),
# so no throwaway SentenceDetectorDLModel instance is created first.
sentencerDL = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Stage 3: Marian translation model, Italian -> English ("opus_mt_it_en").
marian = (
    MarianTransformer.pretrained("opus_mt_it_en", "xx")
    .setInputCols(["sentences"])
    .setOutputCol("translation")
)

# Chain the stages: text -> document -> sentences -> translation.
nlp_pipeline = Pipeline(stages=[
    documentAssembler,
    sentencerDL,
    marian,
])

### Run the pipeline

In [None]:
# One-row frame holding an empty string in a column named "text"; it is only
# used as the input to fit() so the Pipeline becomes a fitted PipelineModel.
empty_df = spark.createDataFrame([[""]], ["text"])
pipeline_model = nlp_pipeline.fit(empty_df)

# Wrap the fitted model in a LightPipeline, used afterwards to annotate
# plain Python strings.
lmodel = LightPipeline(pipeline_model)

### Visualize results

In [None]:
# Annotate the sample text; res[0] is indexed below for its "translation" annotations.
res = lmodel.fullAnnotate(text)

print('Original:', text, '\n\n')
print('Translated:\n')

# Print every translated sentence on its own line.
for annotation in res[0]['translation']:
    print(annotation.result)

### Translate English text to Italian

In [None]:
# Swap in the English -> Italian Marian model ("opus_mt_en_it").
marian = (
    MarianTransformer.pretrained("opus_mt_en_it", "xx")
    .setInputCols(["sentences"])
    .setOutputCol("translation")
)

# Rebuild the pipeline, reusing the existing document assembler and sentence
# detector in front of the new translation stage.
nlp_pipeline = Pipeline(stages=[documentAssembler, sentencerDL, marian])

In [None]:
# English sample sentence to be translated into Italian.
text = "Marian is an efficient, free Neural Machine Translation software"

In [None]:
# One-row frame holding an empty string in a column named "text"; it is only
# used as the input to fit() so the rebuilt Pipeline becomes a PipelineModel.
empty_df = spark.createDataFrame([[""]], ["text"])
pipeline_model = nlp_pipeline.fit(empty_df)

# Wrap the fitted model in a LightPipeline, used afterwards to annotate
# plain Python strings.
lmodel = LightPipeline(pipeline_model)

In [None]:
# Annotate the sample text; res[0] is indexed below for its "translation" annotations.
res = lmodel.fullAnnotate(text)

print('Original:', text, '\n\n')
print('Translated:\n')

# Print every translated sentence on its own line.
for annotation in res[0]['translation']:
    print(annotation.result)

## Other Translation Pipelines

In [None]:
# Italian sample sentence passed to the pretrained translation pipelines below.
text = "L’Istituto di Calcolo e Reti ad Alte Prestazioni è un Istituto del Consiglio Nazionale delle Ricerche"

In [None]:
# Italian to German: load the pretrained "translate_it_de" pipeline (lang code "xx").
it_de = PretrainedPipeline("translate_it_de", lang="xx")

In [None]:
# Run the Italian -> German pipeline on the sample text.
res = it_de.annotate(text)

In [None]:
# Show the "sentence" and "translation" entries of the Italian -> German result.
print(res['sentence'])
print(res['translation'])

In [None]:
# Italian to Spanish: load the pretrained "translate_it_es" pipeline (lang code "xx").
it_es = PretrainedPipeline("translate_it_es", lang="xx")

In [None]:
# Run the Italian -> Spanish pipeline on the sample text.
res = it_es.annotate(text)

In [None]:
# Show the "sentence" and "translation" entries of the Italian -> Spanish result.
print(res['sentence'])
print(res['translation'])

In [None]:
# Uncomment the next line to stop the Spark session when you are finished.
#spark.stop()